The previous posts went something like this:
jump1268.hatenablog.com
jump1268.hatenablog.com
A rough image of the overall idea looks like this:
jabstract.py
#!/usr/bin/env python
# The MIT License (MIT)
# Copyright © 2015 Recruit Technologies Co.,Ltd.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import collections
import re
import codecs

import numpy as np
import networkx as nx
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import pairwise_distances
from janome.tokenizer import Tokenizer

__doc__ = """Japanese summarization module using LexRank algorithm.

This module was reconstructed from summpy.
- https://pypi.python.org/pypi/summpy/
- https://github.com/recruit-tech/summpy
- https://recruit-tech.co.jp/blog/2015/10/30/summpy-released/

Requirements
- numpy
- networkx
- scikit-learn
- janome
"""
__version__ = "0.1.0"
__author__ = "Shumpei IINUMA"
__maintainer__ = "Hajime Nakagami<nakagami@gmail.com>"
__all__ = ['summarize']

tokenizer = Tokenizer()


def is_stopword(n):
    if len(n.surface) == 0:
        return True
    elif re.search(r'^[\s!-@\[-`\{-~ 、-〜!-@[-`]+$', n.surface):
        return True
    elif re.search(r'^(接尾|非自立)', n.part_of_speech.split(',')[1]):
        return True
    elif u'サ変・スル' == n.infl_form or u'ある' == n.base_form:
        return True
    elif re.search(r'^(名詞|動詞|形容詞)', n.part_of_speech.split(',')[0]):
        return False
    else:
        return True


def not_stopword(n):
    return not is_stopword(n)


def node2word(n):
    return n.surface


def node2norm_word(n):
    if n.base_form != '*':
        return n.base_form
    else:
        return n.surface


def word_segmenter_ja(sent, node_filter=not_stopword, node2word=node2norm_word):
    nodes = tokenizer.tokenize(sent)
    if node_filter:
        nodes = [n for n in nodes if node_filter(n)]
    words = [node2word(n) for n in nodes]
    return words


def sent_splitter_ja(text, delimiters=set('。.?!\n\r'),
                     parenthesis='()「」『』“”'):
    '''
    Args:
      text: string that contains multiple Japanese sentences.
      delimiters: set() of sentence delimiter characters.
      parenthesis: to be checked its correspondence.
    Returns:
      list of sentences.
    '''
    paren_chars = set(parenthesis)
    close2open = dict(zip(parenthesis[1::2], parenthesis[0::2]))
    sentences = []
    pstack = []
    buff = []

    for i, c in enumerate(text):
        c_next = text[i+1] if i+1 < len(text) else None
        # check correspondence of parenthesis
        if c in paren_chars:
            if c in close2open:  # close
                if len(pstack) > 0 and pstack[-1] == close2open[c]:
                    pstack.pop()
            else:  # open
                pstack.append(c)
        buff.append(c)
        if c in delimiters:
            if len(pstack) == 0 and c_next not in delimiters:
                s = ''.join(buff).strip()
                if s:
                    sentences.append(s)
                buff = []

    if len(buff) > 0:
        s = ''.join(buff).strip()
        if s:
            sentences.append(s)
    return sentences


def lexrank(sentences, continuous=False, sim_threshold=0.1, alpha=0.9):
    '''
    Compute centrality score of sentences.

    Args:
      sentences: [u'こんにちは.', u'私の名前は飯沼です.', ... ]
      continuous: if True, apply continuous LexRank. (see reference)
      sim_threshold: if continuous is False and similarity is greater or
        equal to sim_threshold, link the sentences.
      alpha: the damping factor of PageRank

    Returns: tuple
      (
        {
          # sentence index -> score
          0: 0.003,
          1: 0.002,
          ...
        },
        similarity_matrix
      )

    Reference:
      Günes Erkan and Dragomir R. Radev.
      LexRank: graph-based lexical centrality as salience in text
      summarization. (section 3)
      http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html
    '''
    # configure ranker
    ranker_params = {'max_iter': 1000}
    ranker = nx.pagerank_scipy
    ranker_params['alpha'] = alpha

    graph = nx.DiGraph()

    # sentence -> tf
    sent_tf_list = []
    for sent in sentences:
        words = word_segmenter_ja(sent)
        tf = collections.Counter(words)
        sent_tf_list.append(tf)

    sent_vectorizer = DictVectorizer(sparse=True)
    sent_vecs = sent_vectorizer.fit_transform(sent_tf_list)

    # compute similarities between sentences
    sim_mat = 1 - pairwise_distances(sent_vecs, sent_vecs, metric='cosine')
    linked_rows, linked_cols = np.where(
        sim_mat > 0 if continuous else sim_mat >= sim_threshold
    )

    # create similarity graph
    graph.add_nodes_from(range(sent_vecs.shape[0]))
    for i, j in zip(linked_rows, linked_cols):
        if i != j:
            weight = sim_mat[i, j] if continuous else 1.0
            graph.add_edge(i, j, weight=weight)

    scores = ranker(graph, **ranker_params)
    return scores, sim_mat


def summarize(sentences, sent_limit=None, char_limit=None, imp_require=None,
              **lexrank_params):
    '''
    Args:
      sentences: text to be summarized or list of sentences
      sent_limit: summary length (the number of sentences)
      char_limit: summary length (the number of characters)
      imp_require: cumulative LexRank score [0.0-1.0]

    Returns:
      list of extracted sentences
    '''
    if isinstance(sentences, str):
        sentences = sent_splitter_ja(sentences)
    scores, sim_mat = lexrank(sentences, **lexrank_params)
    sum_scores = sum(scores.values())
    acc_scores = 0.0
    indexes = set()
    num_sent, num_char = 0, 0
    for i in sorted(scores, key=lambda i: scores[i], reverse=True):
        num_sent += 1
        num_char += len(sentences[i])
        if sent_limit is not None and num_sent > sent_limit:
            break
        if char_limit is not None and num_char > char_limit:
            break
        if imp_require is not None and acc_scores / sum_scores >= imp_require:
            break
        indexes.add(i)
        acc_scores += scores[i]

    if len(indexes) > 0:
        sentences = [sentences[i] for i in sorted(indexes)]
        # note: converting the result with str(sentences) garbles the output
    return sentences


# -------------------------------------------------------------------------------------
def _get_bocchan_text(text):
    # strip Aozora Bunko markup: ruby, ruby markers, annotations, the header
    # block and the bibliographic footer
    text = re.sub(r'《[^》]+》', '', text)
    text = re.sub(r'｜', '', text)
    text = re.sub(r'［＃.+?］', '', text)
    text = re.sub(r'-----[\s\S]*-----', '', text)
    text = re.split('底本:', text)[0]
    return text


if __name__ == '__main__':
    # text = _get_bocchan_text(...)  # the Aozora Bunko sample from summpy; not used here
    test_data = codecs.open("writeme.txt", "r", 'utf-8')
    text = test_data.read()

    result = summarize(text, char_limit=400)
    print('\n\nsummarize(char_limit=400)')
    print('\n'.join(result))

    result = summarize(text, sent_limit=10)
    print('\n\nsummarize(sent_limit=10)')
    print('\n'.join(result))

    result = summarize(text, char_limit=400, continuous=True)
    print('\n\nsummarize(char_limit=400, continuous=True)')
    print('\n'.join(result))

    result = summarize(text, sent_limit=10, continuous=True)
    print('\n\nsummarize(sent_limit=10, continuous=True)')
    print('\n'.join(result))
To be honest, I barely understand the internals of jabstract.
(I did change a few parts here and there, though.)
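As a quick sanity check, summarize can also be called directly on a hand-written string. A minimal sketch (the sample sentences below are made-up examples, and it assumes jabstract.py is importable together with janome, networkx, numpy and scikit-learn):

from jabstract import summarize

# three made-up sentences joined into one text
text = (
    '今日は朝から雨が降っていた。'
    '午後には天気が回復したので、近くの公園まで散歩に出かけた。'
    '公園では多くの人がジョギングをしていた。'
)

# pick at most 2 sentences by LexRank score
for s in summarize(text, sent_limit=2):
    print(s)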
my_text.py
import codecs
import urllib.request as req
from urllib.parse import parse_qsl, urlparse

import pandas as pd
from bs4 import BeautifulSoup
from requests import get as GET


# Search Google with a keyword.
# The argument is a str.
def check(text):
    # Fetch the search result page. Changing the number before the '&'
    # changes how many sites are returned.
    html = GET('https://www.google.co.jp/search?num=4&q=' + ' ' + text).text
    bs = BeautifulSoup(html, 'lxml')
    df = pd.DataFrame(columns=["title", "url"])
    sen = ""
    for el in bs.select("h3.r a"):
        title = el.get_text()
        url = dict(parse_qsl(urlparse(el.get("href")).query))["q"]
        print(title)
        print(" ", url)
        # with named columns on df, the Series can carry the column index
        new_data = pd.Series([title, url], index=df.columns)
        df = df.append(new_data, ignore_index=True)
        # fetch the page with urlopen()
        res = req.urlopen(url)
        # parse it with BeautifulSoup()
        soup = BeautifulSoup(res, 'html.parser')
        # register an error handler ('hoge') that replaces characters
        # that cannot be encoded with a space
        codecs.register_error('hoge', lambda e: (' ', e.end))
        # open the output file ('w' overwrites, 'a' appends)
        f = open('writeme.txt', 'w', encoding='utf-8', errors='hoge')
        # extract the parts we want
        if soup.find("h1") is None:
            title1 = "no title"
        else:
            title1 = soup.find("h1").string
        print("title = ", title1)
        p_list = soup.find_all("p")
        for p in p_list:
            print(p.get_text())
            f.write(p.get_text())
            sen = str(sen) + str(p.get_text())
        f.close()
        print("******************************************************************")
    df.to_csv("output.csv", encoding="shift_jis", header=None, index=None)
    return sen


if __name__ == '__main__':
    check("大学生 やるべき")
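Putting the two files together, the flow is: my_text.py scrapes the text of each search result into writeme.txt and returns it as one string, and jabstract.py summarizes that text. A minimal sketch of driving both from a single script (assuming both files sit in the same directory; the keyword is just the same example as above):

from my_text import check
from jabstract import summarize

# scrape the search results into one string (also written to writeme.txt)
scraped = check("大学生 やるべき")

# summarize the scraped text down to roughly 400 characters
summary = summarize(scraped, char_limit=400)
print('\n'.join(summary))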