Извлечение данных из сертификатов Coursera по науке о данных: парсинг веб-страниц, регулярные выражения и XPath - Fcodenotes

Вот несколько методов, которые вы можете использовать для извлечения данных из сертификата Coursera по науке о данных, а также примеры кода:

Парсинг веб-страниц с помощью Beautiful Soup (Python):

import requests
from bs4 import BeautifulSoup
# Example script: download a certificate page and pull three fields out of it
# with Beautiful Soup. The URL below is the article's sample address.
url = 'https://www.coursera.org/certificate/example'


def _require_text(parsed, tag, css_class):
    """Return the text of the first ``tag`` element whose class is ``css_class``.

    Args:
        parsed: Parsed document to search (the BeautifulSoup object).
        tag: Element name to look for, e.g. ``'h1'``.
        css_class: Value of the ``class`` attribute to match.

    Returns:
        The ``.text`` of the first matching element.

    Raises:
        ValueError: If no matching element is present.
    """
    node = parsed.find(tag, {'class': css_class})
    if node is None:
        # find() yields None on a miss; report which selector failed instead
        # of letting ``.text`` raise an opaque AttributeError.
        raise ValueError(
            'No <{} class="{}"> element found in the page'.format(tag, css_class)
        )
    return node.text


# Fetch the HTML content of the certificate page.
# Requests applies no timeout unless asked, so set one to avoid hanging forever.
response = requests.get(url, timeout=10)
# Stop on 4xx/5xx responses rather than parsing an error page.
response.raise_for_status()
html_content = response.content

# Parse the HTML content with Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the certificate data
certificate_title = _require_text(soup, 'h1', 'certificate-title')
issuer_name = _require_text(soup, 'div', 'issuer-name')
issue_date = _require_text(soup, 'div', 'issue-date')

# Print the extracted data
print("Certificate Title:", certificate_title)
print("Issuer Name:", issuer_name)
print("Issue Date:", issue_date)

Регулярные выражения (Python):

import re
# Example script: extract three certificate fields from raw markup with
# regular expressions. Expects the page markup in a variable 'html_content'.

# The patterns below are str patterns, which cannot be applied to bytes.
# The neighbouring examples store ``response.content`` (bytes) in
# 'html_content', so decode it when needed.
if isinstance(html_content, bytes):
    # NOTE(review): UTF-8 is assumed here - confirm against the page's charset.
    page_markup = html_content.decode('utf-8', errors='replace')
else:
    page_markup = html_content


def _require_group(pattern, text):
    """Return capture group 1 of the first match of ``pattern`` in ``text``.

    Args:
        pattern: Regular expression containing at least one capture group.
        text: String to search.

    Returns:
        The text captured by group 1.

    Raises:
        ValueError: If ``pattern`` does not match anywhere in ``text``.
    """
    # DOTALL lets '.' cross line breaks, so element content that spans
    # several lines is still captured.
    found = re.search(pattern, text, re.DOTALL)
    if found is None:
        # re.search returns None on a miss; name the failing pattern instead
        # of letting ``.group`` raise an opaque AttributeError.
        raise ValueError('Pattern not found in markup: {}'.format(pattern))
    return found.group(1)


# Extract the certificate title using regular expressions
certificate_title = _require_group(r'<h1 class="certificate-title">(.*?)</h1>', page_markup)
# Extract the issuer name using regular expressions
issuer_name = _require_group(r'<div class="issuer-name">(.*?)</div>', page_markup)
# Extract the issue date using regular expressions
issue_date = _require_group(r'<div class="issue-date">(.*?)</div>', page_markup)

# Print the extracted data
print("Certificate Title:", certificate_title)
print("Issuer Name:", issuer_name)
print("Issue Date:", issue_date)

Использование XPath с библиотекой lxml (Python):

import requests
from lxml import html
# Example script: download a certificate page and pull three fields out of it
# with lxml and XPath. The URL below is the article's sample address.
url = 'https://www.coursera.org/certificate/example'


def _require_first(document, expression):
    """Return the first result of evaluating ``expression`` on ``document``.

    Args:
        document: Parsed lxml tree to query.
        expression: XPath expression to evaluate.

    Returns:
        The first item of the XPath result list.

    Raises:
        ValueError: If the expression selects nothing.
    """
    results = document.xpath(expression)
    if not results:
        # Name the failing expression instead of raising a bare IndexError.
        raise ValueError('XPath matched nothing: {}'.format(expression))
    return results[0]


# Fetch the HTML content of the certificate page.
# Requests applies no timeout unless asked, so set one to avoid hanging forever.
response = requests.get(url, timeout=10)
# Stop on 4xx/5xx responses rather than parsing an error page.
response.raise_for_status()
html_content = response.content

# Parse the HTML content with lxml
tree = html.fromstring(html_content)

# Define XPath expressions to extract the certificate data
certificate_title_xpath = '//h1[@class="certificate-title"]/text()'
issuer_name_xpath = '//div[@class="issuer-name"]/text()'
issue_date_xpath = '//div[@class="issue-date"]/text()'

# Extract the certificate data using XPath
certificate_title = _require_first(tree, certificate_title_xpath)
issuer_name = _require_first(tree, issuer_name_xpath)
issue_date = _require_first(tree, issue_date_xpath)

# Print the extracted data
print("Certificate Title:", certificate_title)
print("Issuer Name:", issuer_name)
print("Issue Date:", issue_date)