Вот несколько методов, которые вы можете использовать для извлечения данных из сертификата Coursera по науке о данных, а также примеры кода:
-
Парсинг веб-страниц с помощью Beautiful Soup (Python):
# Example: read certificate details from a certificate page with Beautiful Soup.
import requests
from bs4 import BeautifulSoup

# Example address; replace it with the URL of the certificate to read.
url = 'https://www.coursera.org/certificate/example'

# Fetch the HTML content of the certificate page.
# The timeout keeps the script from waiting forever on an unresponsive server,
# and raise_for_status() turns an HTTP error response (4xx/5xx) into an
# exception instead of letting an error page be parsed as the certificate.
response = requests.get(url, timeout=10)
response.raise_for_status()
html_content = response.content

# Parse the HTML content with Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')


def _element_text(tag, css_class):
    """Return the text of the first ``tag`` element carrying ``css_class``.

    Args:
        tag: HTML tag name to look for (e.g. 'h1').
        css_class: value of the element's class attribute.

    Returns:
        The element's ``.text``.

    Raises:
        ValueError: if the page contains no such element. soup.find() returns
            None in that case, which would otherwise surface as an
            AttributeError on ``.text``.
    """
    element = soup.find(tag, {'class': css_class})
    if element is None:
        raise ValueError(
            "No <{}> element with class '{}' found at {}".format(tag, css_class, url)
        )
    return element.text


# Extract the certificate data.
# NOTE(review): the class names below are taken from the example as given;
# confirm them against the markup of the real page.
certificate_title = _element_text('h1', 'certificate-title')
issuer_name = _element_text('div', 'issuer-name')
issue_date = _element_text('div', 'issue-date')

# Print the extracted data
print("Certificate Title:", certificate_title)
print("Issuer Name:", issuer_name)
print("Issue Date:", issue_date)
Регулярные выражения (Python):
# Example: pull certificate details out of raw HTML with regular expressions.
import re

# Assume the certificate page markup is stored in a variable called
# 'html_content'. The patterns below are str patterns, and re raises TypeError
# when a str pattern is applied to bytes, so a bytes value (for example
# requests' response.content) is decoded first.
# NOTE(review): UTF-8 is assumed here; adjust if the page uses another encoding.
if isinstance(html_content, bytes):
    page_text = html_content.decode('utf-8', errors='replace')
else:
    page_text = html_content


def _first_group(pattern, label):
    """Return capture group 1 of the first match of ``pattern`` in the page.

    Args:
        pattern: regular expression with one capturing group.
        label: human-readable name of the field, used in the error message.

    Returns:
        The text captured by group 1.

    Raises:
        ValueError: if the pattern does not match. re.search() returns None in
            that case, which would otherwise surface as an AttributeError on
            ``.group``.
    """
    match = re.search(pattern, page_text)
    if match is None:
        raise ValueError("Could not find {} in the certificate HTML".format(label))
    return match.group(1)


# No re.DOTALL flag is used, so '.' does not match newlines: each element's
# content has to sit on a single line for these patterns to match.

# Extract the certificate title using regular expressions
certificate_title = _first_group(
    r'<h1 class="certificate-title">(.*?)</h1>', 'the certificate title'
)

# Extract the issuer name using regular expressions
issuer_name = _first_group(
    r'<div class="issuer-name">(.*?)</div>', 'the issuer name'
)

# Extract the issue date using regular expressions
issue_date = _first_group(
    r'<div class="issue-date">(.*?)</div>', 'the issue date'
)

# Print the extracted data
print("Certificate Title:", certificate_title)
print("Issuer Name:", issuer_name)
print("Issue Date:", issue_date)
Использование XPath с библиотекой lxml (Python):
# Example: read certificate details from a certificate page with lxml and XPath.
import requests
from lxml import html

# Example address; replace it with the URL of the certificate to read.
url = 'https://www.coursera.org/certificate/example'

# Fetch the HTML content of the certificate page.
# The timeout keeps the script from waiting forever on an unresponsive server,
# and raise_for_status() turns an HTTP error response (4xx/5xx) into an
# exception instead of letting an error page be parsed as the certificate.
response = requests.get(url, timeout=10)
response.raise_for_status()
html_content = response.content

# Parse the HTML content with lxml
tree = html.fromstring(html_content)

# Define XPath expressions to extract the certificate data.
# NOTE(review): the class names below are taken from the example as given;
# confirm them against the markup of the real page.
certificate_title_xpath = '//h1[@class="certificate-title"]/text()'
issuer_name_xpath = '//div[@class="issuer-name"]/text()'
issue_date_xpath = '//div[@class="issue-date"]/text()'


def _first_match(xpath_expression):
    """Return the first result of ``xpath_expression`` evaluated on the page.

    Args:
        xpath_expression: XPath expression selecting text nodes.

    Returns:
        The first item of the result list.

    Raises:
        ValueError: if the expression selects nothing. Indexing the empty
            result with [0] would otherwise surface as a bare IndexError.
    """
    matches = tree.xpath(xpath_expression)
    if not matches:
        raise ValueError(
            "XPath {} matched nothing at {}".format(xpath_expression, url)
        )
    return matches[0]


# Extract the certificate data using XPath
certificate_title = _first_match(certificate_title_xpath)
issuer_name = _first_match(issuer_name_xpath)
issue_date = _first_match(issue_date_xpath)

# Print the extracted data
print("Certificate Title:", certificate_title)
print("Issuer Name:", issuer_name)
print("Issue Date:", issue_date)