import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
import json
import time
import numpy as np
# Scrape the rankings and output title -> points as JSON.
# First, collect links to the individual ranking pages from the link-list pages.
link_list = []
for i in range(1, 11):
    url_ranking_list_root = "https://toyokeizai.net/category/weeklyranking"
    page_url = "?page=" + str(i) + "&per_page=15"
    url_ranking_list = url_ranking_list_root + page_url
    html = urllib.request.urlopen(url_ranking_list)
    soup = BeautifulSoup(html, "lxml")
    link_html_list = soup.find_all("a", class_="link-box")
    for link_html in link_html_list:
        link_list.append(link_html.get("href"))
    time.sleep(1)  # be polite to the server
link_list = list(set(link_list))  # remove duplicate links
# Build a dictionary mapping each book title to its accumulated points.
point_dict = {}
for n in range(len(link_list)):
    # Pages 2-5 of each ranking article hold ranks 1-200, 50 per page.
    for page_num in range(2, 6):
        root_url = "https://toyokeizai.net"
        page_url = "?page=" + str(page_num)
        url = root_url + link_list[n] + page_url
        html = urllib.request.urlopen(url)
        soup = BeautifulSoup(html, "lxml")
        title_list = soup.find("tbody").find_all("a")
        for i in range(len(title_list)):
            # Award 200 points for rank 1 down to 1 point for rank 200.
            point = 200 - (i + 50 * (page_num - 2))
            if title_list[i].text not in point_dict:
                point_dict[title_list[i].text] = point
            else:
                point_dict[title_list[i].text] += point
        time.sleep(1)
# Write the result out as JSON.
with open("book_data.json", "w", encoding="utf-8") as f:
    json.dump(point_dict, f, indent=4, ensure_ascii=False)
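As a quick sanity check on the file we just wrote, we can load it back and print the five highest-scoring titles:

with open("book_data.json", "r", encoding="utf-8") as f:
    point_dict = json.load(f)
# The five titles with the highest accumulated points.
for title, point in sorted(point_dict.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(title, point)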
from janome.tokenizer import Tokenizer
import re
import collections
import json
import numpy as np
import pandas as pd
with open("book_data.json", "r", encoding="utf-8") as f:
    point_dict = json.load(f)
title_list = list(point_dict.keys())
point_list = np.array(list(point_dict.values()))
title_wakati = []
t = Tokenizer()  # create the tokenizer once; instantiation is expensive
for i in range(len(title_list)):
    # Remove frequent but uninformative words and punctuation with regexes.
    title = title_list[i]
    title = re.sub("新書", "", title)
    title = re.sub("単行本", "", title)
    title = re.sub("雑誌", "", title)
    title = re.sub("文庫", "", title)
    title = re.sub("[。!-/:-@[-`{-~]", "", title)
    tokens = t.tokenize(title)
    for token in tokens:
        partOfSpeech = token.part_of_speech.split(",")[0]
        surface = token.surface
        # Drop single-character words and keep only nouns and verbs.
        if len(surface) > 1:
            if partOfSpeech == "名詞":
                title_wakati.append(surface)
            if partOfSpeech == "動詞":
                title_wakati.append(surface)
# Count word frequencies.
d_count = collections.Counter(title_wakati)
d_count = d_count.most_common()
# Convert to a DataFrame.
word_dct = {"単語": [],
            "出現回数": []}
for i in d_count:
    word_dct["単語"].append(i[0])
    word_dct["出現回数"].append(i[1])
df = pd.DataFrame(word_dct)
df = df.sort_values("出現回数", ascending=False)
df.reset_index(drop=True, inplace=True)
df.to_csv("book_data.csv")
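For reference, a minimal sketch of what Janome returns per token; the sample title here is invented for illustration:

t = Tokenizer()
for token in t.tokenize("経済を学ぶ"):  # hypothetical sample title
    # part_of_speech is a comma-separated string such as "名詞,一般,*,*";
    # the code above keys off its first field.
    print(token.surface, token.part_of_speech.split(",")[0])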
with open("book_data.json","r",encoding="utf-8") as f:
dict=json.load(f)
#ポイントに√をかける
title_list=list(dict.keys())
point_list=np.array(list(dict.values()))
point_list_root=np.sqrt(np.abs(point_list))
#ポイントに応じて単語を重みづけ
title_list_ex=[]
for i in range(len(point_list_root)):
title_list_ex.append(title_list[i]*int(point_list_root[i]))
title_wakati = []
t = Tokenizer()  # create the tokenizer once outside the loop
for i in range(len(title_list_ex)):
    # Remove frequent but uninformative words and punctuation with regexes.
    title = title_list_ex[i]
    title = re.sub("新書", "", title)
    title = re.sub("単行本", "", title)
    title = re.sub("雑誌", "", title)
    title = re.sub("文庫", "", title)
    title = re.sub("[。!-/:-@[-`{-~]", "", title)
    tokens = t.tokenize(title)
    for token in tokens:
        partOfSpeech = token.part_of_speech.split(",")[0]
        surface = token.surface
        # Drop single-character words and keep only nouns and verbs.
        if len(surface) > 1:
            if partOfSpeech == "名詞":
                title_wakati.append(surface)
            if partOfSpeech == "動詞":
                title_wakati.append(surface)
# Count word frequencies.
d_count = collections.Counter(title_wakati)
d_count = d_count.most_common()
# Convert to a DataFrame.
word_dct = {"単語": [],
            "出現頻度": []}
for i in d_count:
    word_dct["単語"].append(i[0])
    word_dct["出現頻度"].append(i[1])
df_ex = pd.DataFrame(word_dct)
df_ex = df_ex.sort_values("出現頻度", ascending=False)
df_ex.reset_index(drop=True, inplace=True)
df_ex.to_csv("book_data_ex.csv")
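The weighting above leans on Python's string repetition. A minimal sketch with an invented title and point total, just to show what title_list_ex holds:

title = "サンプルタイトル"  # hypothetical title, not from the scraped data
point = 144                 # hypothetical point total
weighted = title * int(point ** 0.5)  # the title repeated 12 times
print(weighted.count(title))  # -> 12, so its words are counted 12 times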
with open("book data.json","r",encoding="utf-8") as f:
dict=json.load(f)
#ポイントによって辞書をソート
dict=sorted(dict.items(),key=lambda x:x[1],reverse=True)
title_list=[]
point_list=[]
for i in dict:
title_list.append(i[0])
point_list.append(i[1])
#上位10%と下位10%を抽出
title_list_top=title_list[:300]
title_list_bottom=title_list[2683:]
#以下同じ
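If the total number of titles changes between runs, the 10% boundaries can be derived from the list length instead of hard-coding them; a minimal sketch:

n = len(title_list)
cut = n // 10  # 10% of the titles
title_list_top = title_list[:cut]
title_list_bottom = title_list[n - cut:]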
with open("book_data.json","r",encoding="utf-8") as f:
dict=json.load(f)
title_list=list(dict.keys())
point_list=np.array(list(dict.values()))
df=pd.read_csv("book_data_new.csv",index_col=0)
words_list=list(df["単語"][:100])
title_score=[[] for i in range(len(title_list))]
for i in range(len(title_list)):
title=title_list[i]
for word in words_list:
if word in title:
title_score[i].append(1)
else:
title_score[i].append(0)
df_score=pd.DataFrame(title_score)
df_score.columns=words_list
df_score.index=title_list
The output is a DataFrame like the following.
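The original screenshot is not reproduced here; a hypothetical illustration of the layout (titles and words invented), with one row per title and one 0/1 column per word:

                    日本  経済  投資  ...
サンプルタイトルA     1     0     1  ...
サンプルタイトルB     0     1     0  ...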
Using scikit-learn's feature_selection module, we extract a specified number of words (20 this time) that contributed most to the sales points.
On feature-selection methods: aotamasaki.hatenablog.com
On feature_selection: qiita.com
from sklearn.feature_selection import SelectKBest, f_regression
# Score each word with a univariate F-test against the points (regression target).
selector = SelectKBest(f_regression, k=20)
selector.fit(df_score, point_list)
X_selected = selector.transform(df_score)
mask = selector.get_support()  # boolean mask of the selected columns
scores = selector.scores_
score_dct = {}
for i in range(len(mask)):
    if mask[i]:
        score_dct[words_list[i]] = scores[i]
# Print the selected words in descending order of score.
for k, v in sorted(score_dct.items(), key=lambda x: -x[1]):
    print(str(k) + ": " + str(v))
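SelectKBest also exposes the p-values of the univariate F-tests; a quick sketch for inspecting them alongside the scores, assuming the selector fitted above is still in scope:

stats = pd.DataFrame({
    "単語": words_list,
    "score": selector.scores_,
    "p_value": selector.pvalues_,
})
print(stats.sort_values("score", ascending=False).head(20))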
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
# Recursive feature elimination: repeatedly drop the weakest feature
# according to a linear regression until 20 remain.
selector = RFE(LinearRegression(), n_features_to_select=20)
selector.fit(df_score, point_list)
mask = selector.get_support()
for i in range(len(mask)):
    if mask[i]:
        print(words_list[i])
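The two methods need not pick the same words, so it can be worth intersecting their selections. A minimal sketch, assuming the two boolean masks were saved as kbest_mask and rfe_mask before being overwritten (names invented here):

# kbest_mask: mask from SelectKBest; rfe_mask: mask from RFE (hypothetical names).
kbest_words = {words_list[i] for i in range(len(words_list)) if kbest_mask[i]}
rfe_words = {words_list[i] for i in range(len(words_list)) if rfe_mask[i]}
print(kbest_words & rfe_words)  # words selected by both methods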