-
Notifications
You must be signed in to change notification settings - Fork 391
/
Copy pathstock_news.py
156 lines (133 loc) · 4.81 KB
/
stock_news.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import requests
import pandas as pd
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import os
# Download the SGX company announcements published over the last day and
# keep only the rows that belong to the issuer we are interested in.
today = datetime.now()
# The API expects dates formatted as 'YYYYMMDD', e.g. '20190808'.
start_date = (today - timedelta(days=1)).strftime('%Y%m%d')
end_date = today.strftime('%Y%m%d')
# Local directory the downloaded attachments are written to.
default_folder = './data'

# Start downloading data.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36"}
# NOTE(review): the 160000 -> 155959 window presumably expresses one
# Singapore (UTC+8) calendar day in UTC, while datetime.now() is local time
# -- confirm the intended time zone.
# NOTE(review): pagesize=20 with pagestart=0 fetches only the first page of
# results; later announcements are not requested.
url = f'https://api.sgx.com/announcements/v1.0/?periodstart={start_date}_160000&periodend={end_date}_155959&pagestart=0&pagesize=20'
# The headers must be passed by keyword: the second positional parameter of
# requests.get() is `params`, which would append the User-Agent to the query
# string instead of sending it as an HTTP header.
res = requests.get(url, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
res.raise_for_status()

# Store the downloaded data into a dataframe (for easier access later).
json_data = res.json()['data']
df = pd.DataFrame(json_data)

# Assuming we are interested in the updates related to
# "LODHA DEVELOPERS INTERNATIONAL LIMITED".
stock_name = 'LODHA DEVELOPERS INTERNATIONAL LIMITED'
# Filter out the announcements of every other company.  When the API
# returns no rows the frame has no columns at all, so only filter when the
# column is actually present (otherwise df stays empty).
if 'issuer_name' in df.columns:
    df = df[df['issuer_name'] == stock_name]
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import requests
import pandas as pd
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import os
def get_pdf(url, folder, filename):
    """Download an attachment from links.sgx.com and save it to disk.

    Args:
        url: Path of the attachment, appended to ``https://links.sgx.com``.
        folder: Directory to write the file into; it is created (including
            missing parents) when it does not exist yet.
        filename: Name to save the file under.  Surrounding whitespace is
            stripped and path separators are replaced with ``_`` so that
            the file always lands inside ``folder``.

    Raises:
        requests.RequestException: If the request fails or the server
            answers with an HTTP error status.
        OSError: If the folder or the file cannot be written.
    """
    based_url = 'https://links.sgx.com'
    pdf_url = based_url + url

    # Create the folder first so that it exists after every call, even when
    # the download below fails.  exist_ok avoids the check-then-create race.
    os.makedirs(folder, exist_ok=True)

    res = requests.get(pdf_url, timeout=30)
    # Do not store an HTML error page under a .pdf name.
    res.raise_for_status()

    # The name is scraped from a web page, so neutralise path separators
    # before joining it onto the target directory.
    safe_name = filename.strip()
    for separator in (os.sep, os.altsep):
        if separator:
            safe_name = safe_name.replace(separator, '_')

    # Output pdf file.
    with open(os.path.join(folder, safe_name), 'wb') as f:
        f.write(res.content)
# Visit the announcement page of every record found and download the
# attachment linked from it.
for idx, row in df.iterrows():
    res = requests.get(row['url'], timeout=30)
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(res.text, 'html.parser')

    # The first anchor on the page is taken to be the attachment link.
    link = soup.find('a')
    if link is None or not link.get('href'):
        # No attached pdf file to be downloaded on the specified link.
        continue

    filename = link.text
    part_url = link['href']
    try:
        get_pdf(part_url, default_folder, filename)
    except (requests.RequestException, OSError) as err:
        # Keep the run best-effort, but report the failure instead of
        # silently swallowing every possible exception.
        print(f'Skipping {part_url!r}: {err}')
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import requests
import pandas as pd
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import os
# Authenticate against Google Drive, reusing the locally cached credentials
# whenever that is possible.
gauth = GoogleAuth()
gauth.LoadCredentialsFile("mycreds.txt")

if gauth.credentials is not None and not gauth.access_token_expired:
    # Cached credentials are present and still valid: just initialise them.
    gauth.Authorize()
elif gauth.credentials is None:
    # Nothing cached yet: run the browser based authentication flow.
    gauth.LocalWebserverAuth()
else:
    # Credentials are cached but the access token has expired.
    gauth.Refresh()

# Persist the (possibly renewed) credentials for the next run.
gauth.SaveCredentialsFile("mycreds.txt")

# Drive client built on top of the authenticated session.
drive = GoogleDrive(gauth)

# ID of the Google Drive folder that the uploads should go into.
current_google_folder_id = 'xxxxx'
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import requests
import pandas as pd
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import os
def create_folder(drive, folder_name, parent_folder_id):
    """Create a folder on Google Drive and return the new folder's ID.

    Args:
        drive: Drive client used to create and upload the folder entry.
        folder_name: Title given to the new folder.
        parent_folder_id: ID of the folder the new one is placed in.

    Returns:
        The ``'id'`` entry of the folder after it has been uploaded.
    """
    # Link that attaches the new folder to its parent.
    parent_link = {"kind": "drive#fileLink", "id": parent_folder_id}
    new_folder = drive.CreateFile({
        'title': folder_name,
        # This MIME type marks the entry as a folder rather than a file.
        'mimeType': 'application/vnd.google-apps.folder',
        'parents': [parent_link],
    })
    new_folder.Upload()
    return new_folder['id']
def upload_file(drive, folder_id, path_to_file, file_title):
    """Upload a local file into a Google Drive folder.

    Args:
        drive: Drive client used to create and upload the file entry.
        folder_id: ID of the Drive folder that receives the file.
        path_to_file: Path of the local file whose content is uploaded.
        file_title: Title the file gets on Drive.
    """
    parent_link = {"kind": "drive#fileLink", "id": folder_id}
    remote_file = drive.CreateFile({"parents": [parent_link]})
    remote_file.SetContentFile(path_to_file)
    # Set the title after the content so the requested title is the one used.
    remote_file['title'] = file_title
    remote_file.Upload()
# List everything that currently sits directly inside the destination
# folder on Google Drive (trashed entries excluded).
file_list = drive.ListFile(
    {'q': "'{}' in parents and trashed=false".format(current_google_folder_id)}
).GetList()

# Reuse the entry titled after the stock when there is one, otherwise
# create the folder.  An empty listing simply falls through to creation.
# NOTE(review): the match is on the title only, so a regular file with the
# same title would be picked up as the folder -- consider checking its type.
folder_id = next(
    (entry['id'] for entry in file_list if entry['title'] == stock_name),
    None,
)
if folder_id is None:
    folder_id = create_folder(drive, stock_name, current_google_folder_id)

# Upload the downloaded files one by one and remove each local copy once
# its upload has succeeded.  The local folder is only created when a file
# is downloaded, so it may not exist at all on a day without attachments.
if os.path.isdir(default_folder):
    for f in os.listdir(default_folder):
        local_path = os.path.join(default_folder, f)
        if not os.path.isfile(local_path):
            # Sub-directories can be neither uploaded nor os.remove()d.
            continue
        upload_file(drive, folder_id, local_path, f)
        os.remove(local_path)