git / code.ach.gov.ru / scarping / digest-science-publications
commit 05e11187d71c29cafff46c89b8392bafaea852e4
author Administrator <admin@example.com>
date 2021-08-17 15:06:21 +0000
parents 2b26de8a
message
Upload New File
files
| file | add | del |
|---|---|---|
| main.py | +562 | -0 |
patch
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d0d58f607be992382cf302d87729c60c4fe1f65
--- /dev/null
+++ b/main.py
@@ -0,0 +1,562 @@
+# This script finds new articles on 26 scientific journal sites and sends them to a Telegram chat. Links to articles that have already been sent are stored in a MySQL database.
+import requests
+import telebot
+from bs4 import BeautifulSoup
+import mysql.connector
+#chat_id = id чата в телеграмм
+
+
+def check_url(id, url): # проверка на уже записанные ссылки
+ db = mysql.connector.connect( # соединение с сервиром mysql
+ host='localhost', # имя хоста сервера mysql на компьютере
+ user='root', # mysql user
+ password='1111', #пароль к mysql, если на компьютере нет пароля, то оставить пустым
+ database='journals'# нужно создать базу данных journals с двумя колонками id и url
+ )
+
+ mycursor = db.cursor() # позволяет коду запускать команды в базе данных
+ mycursor.execute(f'SELECT url FROM articles WHERE id={id};') # выполнить команду "SELECT url FROM articles WHERE id={id};" в MySQL (из таблицы articles выбираем колонку url, отсортированную по номерам id)
+ current_url= mycursor.fetchall()[0][0] # получение (retrieve) всех строк из таблицы в виде списка кортежа
+ mycursor.close()
+ db.close()
+ if url == current_url:
+ return 0
+ else:
+ return 1
+
+
+def update_url(id, url): # обновление ссылок в таблице
+ db = mysql.connector.connect(
+ host='localhost',
+ user='root',
+ password='1111',
+ database='journals'
+ )
+
+ mycursor = db.cursor()
+ mycursor.execute(f'UPDATE articles SET url = "{url}" WHERE id = {id};')
+ db.commit()
+ mycursor.close()
+ db.close()
+
+
+def get_html(url): # получить ссылку
+ headers = {'User-Agent': 'Mozilla/5.0 (Windows Mobile 10; Android 10.0; Microsoft; Lumia 950XL) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Mobile Safari/537.36 Edge/40.15254.603'} # зменить User-Agent в соответствии с операционной системой и браузером
+ r = requests.get(url, headers=headers) # отправление запроса на сайт по url, headers (опционально)- словарь заголовок HTTP для отправки определенный url
+ return r.text # получить запрос в виде текста
+
+
+def post(title, abstract, url): # форма ообщения
+ message = f'<b>{title}</b>\n\n{abstract}\n\n{url}' # структура сообщения: сначала название, ниже краткое содержание и ссылка в конце (все с новой строчки)
+ bot.send_message(chat_id='(вместо этого текста вставить чат id)', text=message, parse_mode='HTML') # изменить chat_id, если нужно отпрааить статьи в другой чат.
+
+
+bot = telebot.TeleBot('вместо этого текста вставить токен бота')# токен бота
+
+
+id = 1 # эта чать кода для каждого id примерно одинакова
+html = get_html('https://journals.sagepub.com/home/aje')# сайт, где нужно спарсить статью
+soup = BeautifulSoup(html, 'lxml') # парсинг html и xml
+url = soup.find('div', class_='mostRead').find('div', class_='panel-top').find('div', class_='title').find('a').get('href') # нахождение нужной информации по классам из кода страницы сайта
+url = 'https://journals.sagepub.com' + url
+if check_url(id, url) == 1: # пропустить, если статья уже в базе, добавить, если новая
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('div', class_='publicationContentTitle').find('h1').text
+ try:
+ abstract = soup.find('div', class_='abstractSection').find('p').text
+ except Exception:
+ abstract = '' # отсутствует краткое содержание
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 2
+html = get_html('https://journals.sagepub.com/home/erx')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', class_='mostRead').find('div', class_='panel-top').find('div', class_='title').find('a').get('href')
+url = 'https://journals.sagepub.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('div', class_='publicationContentTitle').find('h1').text
+ try:
+ abstract = soup.find('div', class_='abstractSection').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 3
+html = get_html('https://journals.sagepub.com/home/EVI')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', class_='mostRead').find('div', class_='panel-top').find('div', class_='title').find('a').get('href')
+url = 'https://journals.sagepub.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('div', class_='publicationContentTitle').find('h1').text
+ try:
+ abstract = soup.find('div', class_='abstractSection').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 4
+html = get_html('https://onlinelibrary.wiley.com/journal/1534875x')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', class_='top-content').find('div', class_='issue-item').find('a').get('href')
+url = 'https://onlinelibrary.wiley.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1', class_='citation__title').text
+ try:
+ abstract = soup.find('div', class_='article-section__content').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 5
+html = get_html('https://onlinelibrary.wiley.com/journal/15206688')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', class_='top-content').find('div', class_='issue-item').find('a').get('href')
+url = 'https://onlinelibrary.wiley.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1', class_='citation__title').text
+ try:
+ abstract = soup.find('div', class_='article-section__content').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 6
+html = get_html('https://onlinelibrary.wiley.com/journal/15410072')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', class_='top-content').find('div', class_='issue-item').find('a').get('href')
+url = 'https://onlinelibrary.wiley.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1', class_='citation__title').text
+ try:
+ abstract = soup.find('div', class_='article-section__content').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 7
+html = get_html('https://academic.oup.com/jpart')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', class_='widget-columns').find('div', class_='widget-dynamic-entry').find('a').get('href')
+url = 'https://academic.oup.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1').text
+ try:
+ abstract = soup.find('section', class_='abstract').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 8
+html = get_html('https://scholarworks.umass.edu/pare/')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', class_='article-list').find('div', class_='doc').find_all('p')[1].find('a').get('href')
+print(url)
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('div', id='title').text
+ try:
+ abstract = soup.find('div', id='abstract').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 9
+html = get_html('https://www.sciencedirect.com/journal/evaluation-and-program-planning')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', id='articles-in-press').find('div').find('h3').find('a').get('href')
+url = 'https://www.sciencedirect.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1').text
+ try:
+ abstract = soup.find('div', id='abstracts').find('div', class_='author').find('div').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 10
+html = get_html('https://www.journals.elsevier.com/critical-perspectives-on-accounting/recent-articles')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('ul', class_='feed-list__PodFeedList-sc-9zxyh7-0 jttHbr').find('a').get('href')
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1').text
+ try:
+ abstract = soup.find('div', id='abstracts').find('div', class_='author').find('div').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 11
+html = get_html('https://www.cambridge.org/core/journals/journal-of-public-policy')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', id='main-content-area').find('ul', class_='overview').find('h5').find('a').get('href')
+url = 'https://www.cambridge.org' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1').text
+ try:
+ abstract = soup.find('div', class_='abstract-content').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 12
+html = get_html('https://www.sciencedirect.com/journal/journal-of-accounting-literature')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', id='latest-published-articles').find('div').find('h3').find('a').get('href')
+url = 'https://www.sciencedirect.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1').text
+ try:
+ abstract = soup.find('div', id='abstracts').find('div', class_='author').find('div').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 13
+html = get_html('https://onlinelibrary.wiley.com/journal/14680408')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', class_='top-content').find('div', class_='issue-item').find('a').get('href')
+url = 'https://onlinelibrary.wiley.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1', class_='citation__title').text
+ try:
+ abstract = soup.find('div', class_='article-section__content').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 14
+html = get_html('https://www.journals.elsevier.com/journal-of-policy-modeling/recent-articles')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('ul', class_='feed-list__PodFeedList-sc-9zxyh7-0 jttHbr').find('a').get('href')
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1').text
+ try:
+ abstract = soup.find('div', id='abstracts').find('div', class_='author').find('div').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 15
+html = get_html('https://microsimulation.pub/')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('ol', id='listing').find('li', class_='listing-list__item').find('h4').find('a').get('href')
+url = 'https://microsimulation.pub' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1').text
+ try:
+ abstract = soup.find('section', id='abstract').find('div', class_='article-section__body').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 16
+html = get_html('https://www.exeley.com/journal/evidence_base')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', class_='panel').find('ul', class_='list-text-cont').find('li').find('a').get('href')
+url = 'https://www.exeley.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h3').text
+ try:
+ abstract = soup.find('abstract').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 17
+html = get_html('https://www.ingentaconnect.com/content/tpp/ep')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', id='Issu').find('ul', class_='bobby').find('li', class_='rowShade').find('span', class_='accessIcon').find('a').get('href')
+url = 'https://www.ingentaconnect.com' + url
+html = get_html(url)
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', id='Issu').find('div', class_='data').find('a').get('href')
+url = 'https://www.ingentaconnect.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1').text
+ try:
+ abstract = soup.find('abstract').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 18
+html = get_html('https://onlinelibrary.wiley.com/journal/15406210')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', class_='top-content').find('div', class_='issue-item').find('a').get('href')
+url = 'https://onlinelibrary.wiley.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1', class_='citation__title').text
+ try:
+ abstract = soup.find('div', class_='article-section__content').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 19
+html = get_html('https://www.tandfonline.com/pb/widgets/ajax/graphql/topContentView/ajaxMostRecentController?pbContext=;ctype:string:Journal Content;journal:journal:rpxm20;issue:issue:10.1080/rpxm20.v023.i03;page:string:Table of Contents;csubtype:string:Regular Issue;wgroup:string:Publication Websites;website:website:TFOPB;pageGroup:string:Publication Pages;subPage:string:Current Table of Contents;requestedJournal:journal:rpxm20&widgetId=3666fc42-14a3-451b-af45-ec91a7ed6345')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', class_='literatumMostRecentWidget').find('div', class_='article-card').find('a').get('href')
+url = 'https://www.tandfonline.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1').text
+ try:
+ abstract = soup.find('article', class_='article').find('div', class_='abstractSection').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 20
+html = get_html('https://onlinelibrary.wiley.com/journal/17471346')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', class_='top-content').find('div', class_='issue-item').find('a').get('href')
+url = 'https://onlinelibrary.wiley.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1', class_='citation__title').text
+ try:
+ abstract = soup.find('div', class_='article-section__content').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 21
+html = get_html('https://academic.oup.com/spp')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', class_='widget-columns').find('div', class_='widget-dynamic-entry').find('a').get('href')
+url = 'https://academic.oup.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1').text
+ try:
+ abstract = soup.find('section', class_='abstract').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 22
+html = get_html('https://www.tandfonline.com/pb/widgets/ajax/graphql/topContentView/ajaxMostRecentController?pbContext=;ctype:string:Journal Content;journal:journal:rjpp20;page:string:Table of Contents;requestedJournal:journal:rjpp20;issue:issue:10.1080/rjpp20.v028.i01;csubtype:string:Regular Issue;wgroup:string:Publication Websites;website:website:TFOPB;pageGroup:string:Publication Pages;subPage:string:Current Table of Contents&widgetId=3666fc42-14a3-451b-af45-ec91a7ed6345')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', class_='literatumMostRecentWidget').find('div', class_='article-card').find('a').get('href')
+url = 'https://www.tandfonline.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1').text
+ try:
+ abstract = soup.find('article', class_='article').find('div', class_='abstractSection').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 23
+html = get_html('https://www.tandfonline.com/pb/widgets/ajax/graphql/topContentView/ajaxMostRecentController?pbContext=;ctype:string:Journal Content;journal:journal:tiap20;page:string:Table of Contents;csubtype:string:Regular Issue;wgroup:string:Publication Websites;website:website:TFOPB;pageGroup:string:Publication Pages;subPage:string:Current Table of Contents;requestedJournal:journal:tiap20;issue:issue:10.1080/tiap20.v039.i01&widgetId=3666fc42-14a3-451b-af45-ec91a7ed6345')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', class_='literatumMostRecentWidget').find('div', class_='article-card').find('a').get('href')
+url = 'https://www.tandfonline.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1').text
+ try:
+ abstract = soup.find('article', class_='article').find('div', class_='abstractSection').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 24
+html = get_html('https://www.tandfonline.com/pb/widgets/ajax/graphql/topContentView/ajaxMostRecentController?pbContext=;issue:issue:10.1080/rjde20.v012.i04;ctype:string:Journal Content;requestedJournal:journal:rjde20;page:string:Table of Contents;csubtype:string:Regular Issue;wgroup:string:Publication Websites;website:website:TFOPB;pageGroup:string:Publication Pages;subPage:string:Current Table of Contents;journal:journal:rjde20&widgetId=3666fc42-14a3-451b-af45-ec91a7ed6345')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', class_='literatumMostRecentWidget').find('div', class_='article-card').find('a').get('href')
+url = 'https://www.tandfonline.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1').text
+ try:
+ abstract = soup.find('article', class_='article').find('div', class_='abstractSection').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 25
+html = get_html('https://www.tandfonline.com/pb/widgets/ajax/graphql/topContentView/ajaxMostRecentController?pbContext=;journal:journal:rpas20;requestedJournal:journal:rpas20;ctype:string:Journal Content;issue:issue:10.1080/rpas20.v039.i04;page:string:Table of Contents;csubtype:string:Regular Issue;wgroup:string:Publication Websites;website:website:TFOPB;pageGroup:string:Publication Pages;subPage:string:Current Table of Contents&widgetId=3666fc42-14a3-451b-af45-ec91a7ed6345')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', class_='literatumMostRecentWidget').find('div', class_='article-card').find('a').get('href')
+url = 'https://www.tandfonline.com' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1').text
+ try:
+ abstract = soup.find('article', class_='article').find('div', class_='abstractSection').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+id = 26
+html = get_html('https://www.cambridge.org/core/journals/data-and-policy')
+soup = BeautifulSoup(html, 'lxml')
+url = soup.find('div', id='main-content-area').find('ul', class_='overview').find('h5').find('a').get('href')
+url = 'https://www.cambridge.org' + url
+if check_url(id, url) == 1:
+ html = get_html(url)
+ soup = BeautifulSoup(html, 'lxml')
+ title = soup.find('h1').text
+ try:
+ abstract = soup.find('div', class_='abstract-content').find('p').text
+ except Exception:
+ abstract = ''
+ post(title, url, abstract)
+ update_url(id, url)
+else:
+ pass
+
+
+
+
+
+
+
+
+
+
+
+