Python crawler - crawl images and store them in MySQL database

Empire my love 2022-05-14 14:55:02 阅读数:208

python crawler crawl images store

1. Fetch the URL and parse the website

(1) Use a for loop to fetch the data of each page

(2) Get the URL in each <a> tag (reason: the pictures shown on the listing page have too low a resolution, so follow each picture's link to its own page to find the higher-resolution version)

# Listing URL, e.g. https://www.moyublog.com/95-2-0-0.html
# Sent as the User-Agent header with each request (this is Googlebot's UA string).
Search_engine = {"User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}
# range(number of pages to crawl)
for i in range(1):
    # The page index is the last number in the listing URL
    url = "https://www.moyublog.com/95-2-0-" + str(i) + ".html"
    # Fetch the listing HTML as text. A finite timeout is used because
    # timeout=None would wait indefinitely on a stalled server.
    Type_conversion = requests.get(url=url, headers=Search_engine, timeout=30).text
    # Parse the listing HTML
    bs = BeautifulSoup(Type_conversion, 'html.parser')
    # The div this crawler reads its links from: <div class="slist">
    scope_div = bs.find('div', attrs={'class': 'slist'})
    # Every <a> tag inside that div (find_all is the current name of findAll)
    scope_a = scope_div.find_all('a')
    # print(scope_a)

(3) Open each picture's own page and parse the data on it

# Visit the page behind each <a> tag collected from the listing
for int_A in scope_a:
    # Drop unwanted <a> tags: len() of a bs4 tag counts its child nodes, and
    # only tags with an even count are kept (a heuristic specific to this
    # site's markup).
    if len(int_A) % 2 == 0:
        # Link to the picture's own page
        url_A = int_A.get("href")
        # print(url_A)
        # Decode the raw response bytes as UTF-8 directly. Round-tripping
        # .text through ISO-8859-1 only works when requests guessed
        # ISO-8859-1, and can raise UnicodeEncodeError otherwise.
        Type_conversion_A = requests.get(url=url_A, headers=Search_engine,
                                         timeout=30).content.decode('utf-8')
        # Parse the picture page
        bs_A = BeautifulSoup(Type_conversion_A, 'html.parser')
        # The div the <img> tags are read from: <div class="photo-pic">
        A_div = bs_A.find('div', attrs={'class': 'photo-pic'})
        # Every <img> tag inside that div
        A_img = A_div.find_all('img')
        # print(A_img)

2. Download and save the pictures

(1) Check whether the folder exists and create it if it does not

# Folder the downloaded pictures are written to
folder = "image"
if not os.path.isdir(folder):
    # exist_ok=True avoids a crash if the folder appears between the check
    # above and this call.
    os.makedirs(folder, exist_ok=True)
    print(" file does not exist , Created !")
else:
    print(" Start downloading pictures ")

(2) Download each picture into the folder and name the file

# src attribute of the <img> tag: the URL the picture is downloaded from
get_a_src = a_img.get("src")
# title attribute of the <img> tag: used as the file name
get_a_title = a_img.get("title")
# print(get_a_title)
# Download the picture; the timeout keeps a stalled server from blocking the crawl
download = requests.get(get_a_src, timeout=30)
# Write the downloaded bytes to image/<title>.jpg
with open("image/" + get_a_title + ".jpg", mode="wb") as f:
    f.write(download.content)
# Progress within the current page. i starts at the page index j, so i - j is
# the number of pictures handled so far (for j == 0 that is simply i, so a
# single print covers both cases). The 25 is hard-coded, presumably the
# number of pictures per page.
print(" The first ", i - j, " Picture downloading -", format((i - j) / 25, '.2%'))
# Short pause between downloads
time.sleep(0.001)

3. Convert the pictures to binary and save them in the MySQL database

(1) Connect to the MySQL database

def MySQL_connect(picture_id, picture_name, picture_href, picture_binary_system):
# Open database connection
connection = pymysql.connect(host="****", user="****", password="****", port=****, database="****",
charset='utf8')
# MySQL sentence
sql = 'INSERT INTO picture(picture_id,picture_name,picture_href,picture_binary_system) VALUES (%s,%s,%s,%s)'
# Get tag
cursor = connection.cursor()
try:
# perform SQL sentence
cursor.execute(sql, [picture_id, picture_name, picture_href, picture_binary_system])
# Commit transaction
connection.commit()
except Exception as e:
print(str(e))
# There are abnormal , Roll back the transaction
connection.rollback()
# Free memory
cursor.close()
connection.close()

(2) Loop over the pictures, read each one as binary, and store it in the MySQL database

 Ring open picture
with open("image/" + get_a_title + ".jpg", "rb") as f:
# Convert to binary format , And use base64 To encrypt
base64_data = base64.b64encode(f.read())
a = str(i)
b = (j + 1)
c = str(b)
# MySQL
MySQL_connect(c + "-" + a, get_a_src, get_a_title, base64_data)

4. Complete code

import os
import time
import requests
import pymysql
import base64
from bs4 import BeautifulSoup
def Judge_folder():
    """Make sure the ``image`` folder exists in the current working directory.

    Prints one message when the folder had to be created and a different one
    when it was already there.

    Raises:
        FileExistsError: If ``image`` exists but is not a directory.
    """
    folder = "image"
    if not os.path.isdir(folder):
        # exist_ok=True avoids a crash if the folder appears between the
        # check above and this call.
        os.makedirs(folder, exist_ok=True)
        print(" file does not exist , Created !")
    else:
        print(" Start downloading pictures ")
def Url_parsing():
    """Crawl the listing pages, download each picture and store it in MySQL.

    For every listing page the function follows the links found inside
    ``<div class="slist">``, reads the ``<img>`` tags inside
    ``<div class="photo-pic">`` on each linked page, writes every picture to
    ``image/<title>.jpg`` and passes it, base64-encoded, to ``MySQL_connect``.
    Pages whose expected ``<div>`` is missing are reported and skipped.
    """
    # Seconds to wait on a single HTTP request; without a timeout a stalled
    # server would block the crawl indefinitely.
    request_timeout = 30
    # Sent as the User-Agent header on the HTML requests (this is Googlebot's UA string).
    Search_engine = {"User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}
    # Listing URL pattern: https://www.moyublog.com/95-2-0-<page index>.html
    # range(number of pages to crawl)
    for j in range(1):
        # 1-based page number used in ids and messages
        c = str(j + 1)
        # Picture counter. It starts at the page index rather than 0, so the
        # counter part of each id is offset by j on later pages.
        # NOTE(review): kept as-is so ids already stored stay the same.
        i = j
        url = "https://www.moyublog.com/95-2-0-" + str(j) + ".html"
        # Fetch and parse the listing page
        Type_conversion = requests.get(url=url, headers=Search_engine, timeout=request_timeout).text
        bs = BeautifulSoup(Type_conversion, 'html.parser')
        # The div the links are read from
        scope_div = bs.find('div', attrs={'class': 'slist'})
        if scope_div is None:
            # Without this guard the next call fails with AttributeError on None
            print("No 'slist' block found at", url, "- page skipped.")
            continue
        for int_A in scope_div.find_all('a'):
            # Drop unwanted <a> tags: len() of a bs4 tag counts its child
            # nodes, and only tags with an even count are kept (a heuristic
            # specific to this site's markup).
            if len(int_A) % 2 != 0:
                continue
            # Link to the picture's own page
            url_A = int_A.get("href")
            # Decode the raw bytes as UTF-8 directly; round-tripping .text
            # through ISO-8859-1 can raise UnicodeEncodeError when requests
            # picked a different encoding.
            Type_conversion_A = requests.get(url=url_A, headers=Search_engine,
                                             timeout=request_timeout).content.decode('utf-8')
            bs_A = BeautifulSoup(Type_conversion_A, 'html.parser')
            # The div the <img> tags are read from
            A_div = bs_A.find('div', attrs={'class': 'photo-pic'})
            if A_div is None:
                print("No 'photo-pic' block found at", url_A, "- link skipped.")
                continue
            for a_img in A_div.find_all('img'):
                i += 1
                # src is the download URL, title becomes the file name
                get_a_src = a_img.get("src")
                get_a_title = a_img.get("title")
                # print(get_a_title)
                download = requests.get(get_a_src, timeout=request_timeout)
                # Save the picture as image/<title>.jpg
                with open("image/" + get_a_title + ".jpg", mode="wb") as f:
                    f.write(download.content)
                # Progress within this page: i - j pictures so far. The 25 is
                # hard-coded, presumably the number of pictures per page.
                print(" The first ", i - j, " Picture downloading -", format((i - j) / 25, '.2%'))
                # base64-encode the bytes already in memory (an encoding, not
                # encryption); re-reading the file just written is unnecessary.
                base64_data = base64.b64encode(download.content)
                a = str(i)
                # Arguments follow MySQL_connect's parameter order
                # (picture_id, picture_name, picture_href, picture_binary_system):
                # the title is the name and the src URL is the href.
                MySQL_connect(c + "-" + a, get_a_title, get_a_src, base64_data)
                print(" The first " + c + " page - The first " + a + " This picture has been saved in MySQL In the database !")
                # Short pause between pictures
                time.sleep(0.001)
        print(" The first ", j + 1, " page , Download complete !")
def Exception_error():
    """Entry point: prepare the output folder, then run the crawl.

    A ``KeyboardInterrupt`` raised during the crawl is caught and reported
    with a message instead of a traceback.
    """
    Judge_folder()
    try:
        Url_parsing()
    except KeyboardInterrupt:
        print('\n Program terminated . . . . .')
    # Printed whether the crawl finished or was interrupted
    print(' end !')
def MySQL_connect(picture_id, picture_name, picture_href, picture_binary_system):
# Open database connection
connection = pymysql.connect(host="****", user="****", password="****", port=****, database="****",
charset='utf8')
# MySQL sentence
sql = 'INSERT INTO picture(picture_id,picture_name,picture_href,picture_binary_system) VALUES (%s,%s,%s,%s)'
# Get tag
cursor = connection.cursor()
try:
# perform SQL sentence
cursor.execute(sql, [picture_id, picture_name, picture_href, picture_binary_system])
# Commit transaction
connection.commit()
except Exception as e:
print(str(e))
# There are abnormal , Roll back the transaction
connection.rollback()
# Free memory
cursor.close()
connection.close()
# Run the crawler only when the file is executed as a script, not on import
if __name__ == '__main__':
    Exception_error()

版权声明:本文为[Empire my love]所创,转载请带上原文链接,感谢。 https://javamana.com/2022/134/202205141446234759.html