Catch data from website

  • requests.get()
  • Syntax of compile(pattern, flags=0)
    • Compile a regular expression pattern, returning a pattern object.
    • Used with findall(), search(), and match()
  • re.findall()
In [7]:
# retrieve dji
import requests
import re
import numpy as np
import pandas as pd

def retrieve_dji_list():
    r1 = requests.get('http://money.cnn.com/data/dow30/')
    search_pattern = re.compile('class="wsod_symbol">(.*?)<\/a>.*<span.*">(.*?)<\/span>.*\n.*class="wsod_stream">(.*?)<\/span>')
    dji_list_in_text = re.findall(search_pattern,r1.text)
    dji_list = []
    for item in dji_list_in_text:
        dji_list.append({'code':item[0],'name':item[1],'price':float(item[2])})
    return dji_list
    
dji_list = retrieve_dji_list()
djidf = pd.DataFrame(dji_list)
print(djidf)
    
    code                     name   price
0    MMM                       3M  160.09
1    AXP         American Express  134.46
2   AAPL                    Apple  326.12
3     BA                   Boeing  345.02
4    CAT              Caterpillar  140.47
5    CVX                  Chevron  111.91
6   CSCO                    Cisco   47.59
7     KO                Coca-Cola   59.80
8    DIS                   Disney  141.75
9    DOW             Dow Chemical   48.30
10   XOM              Exxon Mobil   60.79
11    GS            Goldman Sachs  238.62
12    HD               Home Depot  242.91
13   IBM                      IBM  154.76
14  INTC                    Intel   67.59
15   JNJ        Johnson & Johnson  150.46
16   JPM           JPMorgan Chase  138.08
17   MCD               McDonald's  218.11
18   MRK                    Merck   82.35
19  MSFT                Microsoft  184.40
20   NKE                     Nike  103.60
21   PFE                   Pfizer   37.15
22    PG         Procter & Gamble  125.19
23   TRV  Travelers Companies Inc  136.95
24   UTX      United Technologies  153.80
25   UNH             UnitedHealth  303.98
26    VZ                  Verizon   58.60
27     V                     Visa  208.11
28   WMT                 Wal-Mart  117.44
29   WBA                 Walgreen   53.58
In [6]:
# craw quotesdf
import requests
import re
import json
import pandas as pd
 
def retrieve_quotes_historical(stock_code):
    quotes = []
    url = 'https://finance.yahoo.com/quote/%s/history?p=%s' % (stock_code, stock_code)
    try:
        r = requests.get(url)
    except ConnectionError as err:
        print(err)  
    m = re.findall('"HistoricalPriceStore":{"prices":(.*?),"isPending"', r.text)
    if m:
        quotes = json.loads(m[0])     # m = ['[{...},{...},...]']
        quotes = quotes[::-1]
    return  [item for item in quotes if 'type' not in item]
 
quotes = retrieve_quotes_historical('AXP')
quotesdf_ori = pd.DataFrame(quotes)
quotesdf = quotesdf_ori.drop(['adjclose'], axis = 1)
print(quotesdf)
          close        date        high         low        open    volume
0    110.699997  1554730200  110.849998  109.919998  110.620003   4776600
1    109.849998  1554816600  110.309998  109.489998  110.010002   2869300
2    110.160004  1554903000  110.480003  109.519997  110.480003   2650100
3    109.849998  1554989400  110.830002  109.419998  110.510002   2320600
4    110.910004  1555075800  111.550003  110.449997  110.820000   2316900
5    110.529999  1555335000  110.970001  110.250000  110.919998   2285000
6    111.879997  1555421400  112.029999  110.940002  111.089996   2523400
7    111.760002  1555507800  112.500000  111.250000  112.309998   3799900
8    113.669998  1555594200  114.139999  111.320000  112.250000   6186800
9    112.330002  1555939800  113.400002  112.230003  112.599998   3174400
10   113.970001  1556026200  114.040001  111.900002  112.190002   3342800
11   114.019997  1556112600  114.360001  113.419998  113.760002   2318400
12   115.879997  1556199000  116.500000  114.750000  115.230003   4595200
13   117.599998  1556285400  117.730003  116.099998  116.589996   2990600
14   116.760002  1556544600  117.699997  116.550003  117.239998   3096200
15   117.230003  1556631000  117.349998  116.099998  117.050003   2387900
16   117.019997  1556717400  117.989998  116.930000  117.470001   2450500
17   117.250000  1556803800  117.650002  116.680000  117.000000   2538700
18   119.349998  1556890200  119.660004  117.629997  117.910004   2856200
19   119.339996  1557149400  119.610001  117.050003  117.260002   2924500
20   118.220001  1557235800  119.410004  117.650002  118.370003   6282300
21   117.489998  1557322200  118.110001  116.820000  117.830002   4037500
22   117.260002  1557408600  117.290001  115.379997  116.150002   3219600
23   118.459999  1557495000  118.930000  116.209999  116.860001   2435100
24   115.900002  1557754200  116.889999  115.620003  116.419998   3710600
25   117.010002  1557840600  117.730003  115.769997  115.980003   2460700
26   117.660004  1557927000  118.230003  115.900002  116.180000   2711400
27   119.839996  1558013400  120.349998  117.769997  117.769997   4143500
28   119.070000  1558099800  119.589996  118.510002  118.599998   5273600
29   119.839996  1558359000  120.320000  118.620003  118.830002   2765600
..          ...         ...         ...         ...         ...       ...
221  128.190002  1582554600  130.960007  127.230003  129.429993   6329300
222  120.900002  1582641000  127.989998  119.910004  127.709999   7628100
223  118.500000  1582727400  122.430000  118.470001  121.269997   5268000
224  112.809998  1582813800  117.480003  112.779999  115.900002   8279800
225  109.930000  1582900200  111.029999  107.000000  109.050003  11363600
226  113.870003  1583159400  113.989998  108.589996  111.110001   9851600
227  108.010002  1583245800  116.400002  107.000000  115.019997   9285000
228  115.699997  1583332200  115.790001  109.250000  110.250000   7470900
229  110.940002  1583418600  112.589996  109.699997  111.620003  12060600
230  108.239998  1583505000  108.769997  103.790001  105.730003   9897700
231   98.290001  1583760600  101.440002   96.940002  100.870003  12266300
232  103.300003  1583847000  103.370003   97.449997  101.930000   9793200
233   95.379997  1583933400  100.959999   93.870003  100.300003   8822900
234   83.529999  1584019800   89.610001   81.809998   87.660004  12206600
235   99.599998  1584106200  100.080002   84.300003   90.379997  12680400
236   85.870003  1584365400   93.180000   80.370003   82.440002  15072400
237   86.580002  1584451800   88.940002   83.220001   87.610001  10512200
238   73.750000  1584538200   81.519997   67.000000   80.019997  14359300
239   77.070000  1584624600   77.889999   69.790001   72.540001  12330400
240   74.120003  1584711000   82.440002   72.559998   78.980003  12208600
241   68.959999  1584970200   75.419998   68.120003   72.820000   8907600
242   84.050003  1585056600   84.769997   71.699997   73.510002  10593900
243   90.169998  1585143000   96.459999   85.320000   87.610001  11046500
244   93.290001  1585229400   99.690002   90.839996   92.239998   9459400
245   88.730003  1585315800   91.970001   85.820000   87.940002   6999800
246   90.269997  1585575000   91.389999   85.269997   88.510002   5565000
247   85.610001  1585661400   90.900002   85.070000   89.940002   6075900
248   77.839996  1585747800   81.309998   77.000000   80.650002   8313400
249   76.660004  1585834200   79.120003   75.830002   77.059998   8338100
250   73.599998  1585920600   76.660004   72.610001   76.000000  10020700

[251 rows x 6 columns]

Examples to Craw COVID-19 data

In [3]:
import time
import json
import requests
import pandas as pd
import numpy as np


def getDaily():    
    url = 'https://view.inews.qq.com/g2/getOnsInfo?name=wuwei_ww_cn_day_counts&callback=&_=%d'%int(time.time()*1000)
    data = json.loads(requests.get(url=url).json()['data'])
    data.sort(key=lambda x:x['date'])
    data = pd.DataFrame(data)
    data['date'] = data['date'].map(lambda x: '2020%s'%(x.replace('/','')))
    return data

def getCity():    
    distribution_data = pd.DataFrame()
    url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d'%int(time.time()*1000)
    data = json.loads(requests.get(url=url).json()['data'])['areaTree'][0]['children']
    province_num = len(data)
    for i in range(province_num):
        province_data = data[i]
        cities_data = province_data['children']
        cities_num = len(cities_data)
        for j in range(cities_num):
            city_data = cities_data[j]
            temp = pd.DataFrame(city_data['total'],index=[0])
            del temp['suspect']
            temp['city'] = city_data['name']
            temp['province'] = province_data['name']
            distribution_data = pd.concat([distribution_data,temp])
    return distribution_data

if __name__ == '__main__':
    daily_data = getDaily()
    distribution_data = getCity()
    daily_data.to_csv(r'C:\py2020\crawler\covid19_daily.csv',index=False)
    distribution_data.to_csv(r'C:\py2020\crawler\covid19_city.csv',index=False)
In [129]:
import time
import json
import requests
import pandas as pd
import numpy as np


def getCovid19Foreign(fd):    
    url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_foreign&callback=&_=%d'%int(time.time()*1000)
    data = json.loads(requests.get(url=url).json()['data'])['foreignList']
    for nt in data:
        fd.append([nt['name'],nt['date'],nt['confirmAdd'],nt['confirm'],nt['dead'],nt['heal'],nt['nowConfirm'],
                  nt['confirmCompare'],nt['nowConfirmCompare'],nt['healCompare'],nt['deadCompare']])   
        
def saveCovid19List(covid19List,output_file):
    covid19df = pd.DataFrame(covid19List)
    covid19df.to_csv(output_file,index=False)
    
if __name__ == '__main__':
    covid19fo = []
    getCovid19Foreign(covid19fo)
    output_file = 'C:/py2020/crawler/Covid19Info_foreign.csv'
    saveCovid19List(covid19fo,output_file)
    print(covid19fo[:2])
[['西班牙', '04.10', 5002, 153222, 15447, 52165, 85610, 68027, 22464, 37456, 8107], ['美国', '03.30', 0, 143071, 2513, 4856, 135702, 0, 0, 0, 0]]
In [85]:
from bs4 import BeautifulSoup
import bs4
import requests
import pandas as pd

def getHTTPText(url):
    try:
        r = requests.get(url,timeout = 30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "Error"
    
def fillCovid19List(ulist,html):
    soup = BeautifulSoup(html,"html.parser")    
    h3 = soup.find_all("h3")
    del h3[0]
    
    tbody = soup.find_all("tbody")
    tr = tbody[0]    
    tds = tr('td')    
    ulist.append(['All states',tds[0].string,tds[1].string,tds[2].string,tds[3].string,tds[4].string]) 
    del tbody[0]
    
    for tr, a in zip(tbody,h3):
        if isinstance(tr,bs4.element.Tag) and isinstance(a,bs4.element.Tag):
            tds = tr('td')
            atag = a('a')
            ulist.append([atag[0].string,tds[0].string,tds[1].string,tds[2].string,tds[10].string,tds[11].string]) 
            
            
def printCovid19List(ulist):
    tpl = "{:<15}\t{:<15}\t{:<10}\t{:<10}\t{:<10}\t{:<10}"
    print(tpl.format("State","Positive","Negative","Pending","Deaths","Total test results"))
    for i in range(len(ulist)):
        u = ulist[i]
        print(tpl.format(u[0],u[1],u[2],u[3],u[4],u[5]))
    
def saveCovid19List(covid19List,output_file):
    covid19df = pd.DataFrame(covid19List)
    covid19df.to_csv(output_file,index=False)
    
def main():
    covid19fo = []
    url = 'https://covidtracking.com/data/?from=groupmessage&isappinstalled=0'
    output_file = 'C:/py2020/crawler/Covid19Info_US.csv'
    html = getHTTPText(url)
    fillCovid19List(covid19fo,html)
    printCovid19List(covid19fo) 
    saveCovid19List(covid19fo,output_file)

main()
State          	Positive       	Negative  	Pending   	Deaths    	Total test results
All states     	460,287        	1,912,056 	17,776    	16,535    	2,372,343 
Alabama        	2,838          	18,058    	N/A       	78        	20,896    
Alaska         	235            	6,988     	N/A       	7         	7,223     
American Samoa 	0              	20        	11        	0         	20        
Arizona        	3,018          	34,160    	N/A       	89        	37,178    
Arkansas       	1,119          	13,832    	N/A       	21        	14,951    
California     	18,309         	145,191   	14,100    	492       	163,500   
Colorado       	6,202          	24,978    	N/A       	226       	31,180    
Connecticut    	9,784          	23,718    	N/A       	380       	33,502    
Delaware       	1,209          	8,683     	N/A       	23        	9,892     
District Of Columbia	1,523          	7,201     	N/A       	32        	8,724     
Florida        	16,826         	139,862   	1,523     	371       	156,688   
Georgia        	10,885         	30,519    	N/A       	412       	41,404    
Guam           	130            	624       	N/A       	4         	754       
Hawaii         	442            	15,707    	N/A       	6         	16,149    
Idaho          	1,353          	11,741    	N/A       	24        	13,094    
Illinois       	16,422         	64,435    	N/A       	528       	80,857    
Indiana        	6,351          	25,782    	N/A       	245       	32,133    
Iowa           	1,270          	13,703    	N/A       	29        	14,973    
Kansas         	1,106          	9,669     	N/A       	42        	10,775    
Kentucky       	1,452          	21,718    	N/A       	79        	23,170    
Louisiana      	18,283         	68,636    	N/A       	702       	86,919    
Maine          	560            	6,088     	N/A       	16        	6,648     
Maryland       	6,185          	35,344    	N/A       	138       	41,529    
Massachusetts  	18,941         	76,017    	N/A       	503       	94,958    
Michigan       	21,504         	31,362    	N/A       	1,076     	52,866    
Minnesota      	1,242          	31,052    	N/A       	50        	32,294    
Mississippi    	2,260          	18,632    	N/A       	76        	20,892    
Missouri       	3,539          	36,941    	N/A       	77        	40,480    
Montana        	354            	7,506     	N/A       	6         	7,860     
Nebraska       	577            	8,256     	N/A       	15        	8,833     
Nevada         	2,456          	19,315    	N/A       	86        	21,771    
New Hampshire  	819            	9,139     	155       	21        	9,958     
New Jersey     	51,027         	56,165    	N/A       	1,700     	107,192   
New Mexico     	989            	22,942    	N/A       	17        	23,931    
New York       	159,937        	231,612   	N/A       	7,067     	391,549   
North Carolina 	3,651          	44,158    	N/A       	65        	47,809    
North Dakota   	269            	8,721     	N/A       	5         	8,990     
Northern Mariana Islands	11             	27        	9         	2         	38        
Ohio           	5,512          	50,473    	N/A       	213       	55,985    
Oklahoma       	1,684          	18,595    	N/A       	80        	20,279    
Oregon         	1,321          	24,306    	2         	44        	25,627    
Pennsylvania   	18,228         	87,374    	N/A       	338       	105,602   
Puerto Rico    	683            	4,703     	1,304     	33        	5,386     
Rhode Island   	1,727          	10,682    	N/A       	43        	12,409    
South Carolina 	2,792          	23,504    	N/A       	67        	26,296    
South Dakota   	447            	6,700     	N/A       	6         	7,147     
Tennessee      	4,634          	55,215    	N/A       	94        	59,849    
Texas          	10,230         	95,904    	N/A       	199       	106,134   
US Virgin Islands	50             	273       	45        	1         	323       
Utah           	1,976          	36,397    	N/A       	13        	38,373    
Vermont        	628            	7,553     	N/A       	23        	8,181     
Virginia       	4,042          	28,984    	627       	109       	33,026    
Washington     	9,608          	83,391    	N/A       	446       	92,999    
West Virginia  	523            	13,340    	N/A       	5         	13,863    
Wisconsin      	2,885          	31,424    	N/A       	111       	34,309    
Wyoming        	239            	4,736     	N/A       	0         	4,975     
In [12]:
import pandas_datareader.data as web
df = web.DataReader('AXP','stooq')
df.head(5)
Out[12]:
Open High Low Close Volume
Date
2020-04-03 76.00 76.66 72.61 73.60 10023085
2020-04-02 77.06 79.12 75.83 76.66 8338115
2020-04-01 80.65 81.31 77.00 77.84 8313414
2020-03-31 89.94 90.90 85.07 85.61 6076864
2020-03-30 88.51 91.39 85.27 90.27 5564975
In [1]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import pandas_datareader.data as web
start = datetime.datetime(2019,10,1)
end = datetime.date.today()
stock = web.DataReader("600797.SS", "yahoo", start, end)
stock.head()
#plt.plot(stock_info['Close'], 'g')
#plt.show()
Out[1]:
High Low Open Close Volume Adj Close
Date
2019-10-08 8.61 8.43 8.44 8.43 8300799 8.43
2019-10-09 8.53 8.25 8.44 8.53 7688194 8.53
2019-10-10 8.69 8.51 8.56 8.62 11553723 8.62
2019-10-11 8.72 8.51 8.66 8.62 11982673 8.62
2019-10-14 8.87 8.70 8.73 8.83 16223941 8.83
In [9]:
from pandas_datareader import data
import matplotlib.pyplot as plt
import pandas as pd
stock_code = input("美股直接输入股票代码如GOOG \n港股输入代码+对应股市,如腾讯:0700.hk \n国内股票需要区分上证和深证,股票代码后面加.ss或者.sz\n请输入你要查询的股票代码:")
start_date = "2019-01-01"
end_date = "2020-01-01"
stock_info = data.get_data_yahoo(stock_code, start_date, end_date)
print(stock_info.head())
stock_info.to_excel('%s.xlsx'%stock_code)
stock_info.to_csv('%s.csv'%stock_code)
美股直接输入股票代码如GOOG 
港股输入代码+对应股市,如腾讯:0700.hk 
国内股票需要区分上证和深证,股票代码后面加.ss或者.sz
请输入你要查询的股票代码:AXP
                 High        Low       Open      Close     Volume  Adj Close
Date                                                                        
2018-12-31  95.610001  94.300003  95.129997  95.320000  2821100.0  93.100761
2019-01-02  96.269997  93.769997  93.910004  95.680000  4175400.0  93.452377
2019-01-03  95.180000  93.230003  94.790001  93.430000  4776600.0  91.628242
2019-01-04  98.349998  94.900002  95.370003  97.639999  4637200.0  95.757050
2019-01-07  98.919998  97.019997  97.769997  98.169998  3289000.0  96.276825
In [5]:
import pandas as pd
import requests
import re
import json 
from pandas_datareader import data
 
def retrieve_quotes_historical(stock_code):
    quotes = []
    url = 'http://finance.yahoo.com/quote/%s/history?p=%s' % (stock_code, stock_code)
    try:
        r = requests.get(url)
    except ConnectionError as err:
        print(err)
    m = re.findall('"HistoricalPriceStore":{"prices":(.*?),"isPending"', r.text)
    if m:
        quotes = json.loads(m[0]) 
        quotes = quotes[::-1] 

quotes = retrieve_quotes_historical('AXP')
df = pd.DataFrame(quotes)
df.head()
# df.to_cvs('quotes1.cvs')
Out[5]:

sklearn

In [12]:
from sklearn import datasets
dir(datasets)
In [11]:
from sklearn import datasets
iris = datasets.load_iris()
iris.data[1:5]
Out[11]:
array([[4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])
In [24]:
nr = np.size(iris.data,0)
nc = np.size(iris.data,1)
print(nr,nc)
150 4
In [14]:
iris.target
Out[14]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
In [20]:
import numpy as np
from scipy.cluster.vq import vq, kmeans, whiten

whiten = whiten(iris.data) # calculate std row-wise
centroids,_ = kmeans(whiten, 3) # cluster
result,_= vq(whiten, centroids)
print(result)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 0 0 0 2 0 2 0 0 0 0 0 0 2 0 0 0 0 2 0 0 0
 0 2 2 2 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 2 2 2 2 2 0 2 0 2 0 2 2 0 2 2 2 2 2 2 0 0 2 2 2 2 2 2 2 0 2 2 2 0 2
 2 2]
In [ ]: