Data can be retrieved from a website with requests.get(). The downloaded text is then parsed with regular expressions: compile(pattern, flags=0) builds a pattern object, and the main matching functions are findall(), search(), and match(). The crawler examples below rely mostly on re.findall(), which returns a list of all non-overlapping matches.
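A minimal sketch of how these functions behave, using a made-up pattern and sample string in place of a downloaded page:

import re

sample = 'Open price: 12.5, close price: 13.0'   # stand-in for r.text from requests.get()
pattern = re.compile(r'price: (\d+\.\d+)')       # compile(pattern, flags=0)
print(re.findall(pattern, sample))               # ['12.5', '13.0'] - every non-overlapping match
print(re.search(pattern, sample).group(1))       # '12.5' - first match anywhere in the string
print(re.match(pattern, sample))                 # None - match() succeeds only at the start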
# Retrieve the Dow Jones Industrial Average components (code, name, price) from CNN Money
import requests
import re
import numpy as np
import pandas as pd

def retrieve_dji_list():
    r1 = requests.get('http://money.cnn.com/data/dow30/')
    search_pattern = re.compile(r'class="wsod_symbol">(.*?)</a>.*<span.*">(.*?)</span>.*\n.*class="wsod_stream">(.*?)</span>')
    dji_list_in_text = re.findall(search_pattern, r1.text)
    dji_list = []
    for item in dji_list_in_text:
        dji_list.append({'code': item[0], 'name': item[1], 'price': float(item[2])})
    return dji_list

dji_list = retrieve_dji_list()
djidf = pd.DataFrame(dji_list)
print(djidf)
# Crawl historical quotes for a stock from Yahoo Finance into quotesdf
import requests
import re
import json
import pandas as pd

def retrieve_quotes_historical(stock_code):
    quotes = []
    url = 'https://finance.yahoo.com/quote/%s/history?p=%s' % (stock_code, stock_code)
    try:
        r = requests.get(url)
    except requests.exceptions.ConnectionError as err:
        print(err)
        return quotes
    m = re.findall('"HistoricalPriceStore":{"prices":(.*?),"isPending"', r.text)
    if m:
        quotes = json.loads(m[0])   # m = ['[{...},{...},...]']
        quotes = quotes[::-1]
    return [item for item in quotes if 'type' not in item]   # drop dividend/split entries

quotes = retrieve_quotes_historical('AXP')
quotesdf_ori = pd.DataFrame(quotes)
quotesdf = quotesdf_ori.drop(['adjclose'], axis=1)
print(quotesdf)
# Crawl COVID-19 daily totals and per-city distribution from the Tencent news API
import time
import json
import requests
import pandas as pd
import numpy as np

def getDaily():
    url = 'https://view.inews.qq.com/g2/getOnsInfo?name=wuwei_ww_cn_day_counts&callback=&_=%d' % int(time.time()*1000)
    data = json.loads(requests.get(url=url).json()['data'])
    data.sort(key=lambda x: x['date'])
    data = pd.DataFrame(data)
    data['date'] = data['date'].map(lambda x: '2020%s' % (x.replace('/', '')))
    return data

def getCity():
    distribution_data = pd.DataFrame()
    url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d' % int(time.time()*1000)
    data = json.loads(requests.get(url=url).json()['data'])['areaTree'][0]['children']
    province_num = len(data)
    for i in range(province_num):
        province_data = data[i]
        cities_data = province_data['children']
        cities_num = len(cities_data)
        for j in range(cities_num):
            city_data = cities_data[j]
            temp = pd.DataFrame(city_data['total'], index=[0])
            del temp['suspect']
            temp['city'] = city_data['name']
            temp['province'] = province_data['name']
            distribution_data = pd.concat([distribution_data, temp])
    return distribution_data

if __name__ == '__main__':
    daily_data = getDaily()
    distribution_data = getCity()
    daily_data.to_csv(r'C:\py2020\crawler\covid19_daily.csv', index=False)
    distribution_data.to_csv(r'C:\py2020\crawler\covid19_city.csv', index=False)
# Crawl COVID-19 statistics for countries outside China from the Tencent news API
import time
import json
import requests
import pandas as pd
import numpy as np

def getCovid19Foreign(fd):
    url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_foreign&callback=&_=%d' % int(time.time()*1000)
    data = json.loads(requests.get(url=url).json()['data'])['foreignList']
    for nt in data:
        fd.append([nt['name'], nt['date'], nt['confirmAdd'], nt['confirm'], nt['dead'], nt['heal'], nt['nowConfirm'],
                   nt['confirmCompare'], nt['nowConfirmCompare'], nt['healCompare'], nt['deadCompare']])

def saveCovid19List(covid19List, output_file):
    covid19df = pd.DataFrame(covid19List)
    covid19df.to_csv(output_file, index=False)

if __name__ == '__main__':
    covid19fo = []
    getCovid19Foreign(covid19fo)
    output_file = 'C:/py2020/crawler/Covid19Info_foreign.csv'
    saveCovid19List(covid19fo, output_file)
    print(covid19fo[:2])
# Scrape per-state COVID-19 testing figures from covidtracking.com with BeautifulSoup
from bs4 import BeautifulSoup
import bs4
import requests
import pandas as pd

def getHTTPText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return "Error"

def fillCovid19List(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    h3 = soup.find_all("h3")
    del h3[0]
    tbody = soup.find_all("tbody")
    tr = tbody[0]
    tds = tr('td')
    ulist.append(['All states', tds[0].string, tds[1].string, tds[2].string, tds[3].string, tds[4].string])
    del tbody[0]
    for tr, a in zip(tbody, h3):
        if isinstance(tr, bs4.element.Tag) and isinstance(a, bs4.element.Tag):
            tds = tr('td')
            atag = a('a')
            ulist.append([atag[0].string, tds[0].string, tds[1].string, tds[2].string, tds[10].string, tds[11].string])

def printCovid19List(ulist):
    tpl = "{:<15}\t{:<15}\t{:<10}\t{:<10}\t{:<10}\t{:<10}"
    print(tpl.format("State", "Positive", "Negative", "Pending", "Deaths", "Total test results"))
    for i in range(len(ulist)):
        u = ulist[i]
        print(tpl.format(u[0], u[1], u[2], u[3], u[4], u[5]))

def saveCovid19List(covid19List, output_file):
    covid19df = pd.DataFrame(covid19List)
    covid19df.to_csv(output_file, index=False)

def main():
    covid19fo = []
    url = 'https://covidtracking.com/data/?from=groupmessage&isappinstalled=0'
    output_file = 'C:/py2020/crawler/Covid19Info_US.csv'
    html = getHTTPText(url)
    fillCovid19List(covid19fo, html)
    printCovid19List(covid19fo)
    saveCovid19List(covid19fo, output_file)

main()
import pandas_datareader.data as web
df = web.DataReader('AXP','stooq')
df.head(5)
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import pandas_datareader.data as web
start = datetime.datetime(2019,10,1)
end = datetime.date.today()
stock = web.DataReader("600797.SS", "yahoo", start, end)
stock.head()
# plt.plot(stock['Close'], 'g')
# plt.show()
from pandas_datareader import data
import matplotlib.pyplot as plt
import pandas as pd
stock_code = input("For US stocks, enter the ticker directly, e.g. GOOG\nFor Hong Kong stocks, append the market to the code, e.g. Tencent: 0700.hk\nFor mainland China stocks, distinguish Shanghai and Shenzhen by appending .ss or .sz to the code\nEnter the stock code to query: ")
start_date = "2019-01-01"
end_date = "2020-01-01"
stock_info = data.get_data_yahoo(stock_code, start_date, end_date)
print(stock_info.head())
stock_info.to_excel('%s.xlsx'%stock_code)
stock_info.to_csv('%s.csv'%stock_code)
import pandas as pd
import requests
import re
import json
from pandas_datareader import data

def retrieve_quotes_historical(stock_code):
    quotes = []
    url = 'http://finance.yahoo.com/quote/%s/history?p=%s' % (stock_code, stock_code)
    try:
        r = requests.get(url)
    except requests.exceptions.ConnectionError as err:
        print(err)
        return quotes
    m = re.findall('"HistoricalPriceStore":{"prices":(.*?),"isPending"', r.text)
    if m:
        quotes = json.loads(m[0])
        quotes = quotes[::-1]
    return quotes   # return the parsed price records, oldest first

quotes = retrieve_quotes_historical('AXP')
df = pd.DataFrame(quotes)
df.head()
# df.to_csv('quotes1.csv')
from sklearn import datasets
dir(datasets)                # list the bundled example datasets
import numpy as np
from sklearn import datasets
iris = datasets.load_iris()
iris.data[1:5]
nr = np.size(iris.data, 0)   # number of samples
nc = np.size(iris.data, 1)   # number of features
print(nr, nc)
iris.target
import numpy as np
from scipy.cluster.vq import vq, kmeans, whiten
whitened = whiten(iris.data)         # normalize each feature (column) by its standard deviation
centroids, _ = kmeans(whitened, 3)   # cluster into 3 groups
result, _ = vq(whitened, centroids)  # assign each sample to its nearest centroid
print(result)