
I wrote a text-summarization web program (module side).

Here's where things stood as of the previous posts:
jump1268.hatenablog.com
jump1268.hatenablog.com




Here's an image of what it looks like:
f:id:jump1268:20181229165450p:plain


jabstract.py

#!/usr/bin/env python
# The MIT License (MIT)
# Copyright © 2015 Recruit Technologies Co.,Ltd.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import collections
import re
import numpy as np
import networkx as nx

from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import pairwise_distances

from janome.tokenizer import Tokenizer
import codecs

__doc__ = """Japanese summarization module using LexRank algorithm.
This module was reconstructed from summpy.
   - https://pypi.python.org/pypi/summpy/
   - https://github.com/recruit-tech/summpy
   - https://recruit-tech.co.jp/blog/2015/10/30/summpy-released/
Requirements
   - numpy
   - networkx
   - scikit-learn
   - janome
"""
__version__ = "0.1.0"
__author__ = "Shumpei IINUMA"
__maintainer__ = "Hajime Nakagami<nakagami@gmail.com>"
__all__ = ['summarize']


tokenizer = Tokenizer()


def is_stopword(n):
    if len(n.surface) == 0:
        return True
    elif re.search(r'^[\s!-@\[-`\{-~ 、-〜!-@[-`]+$', n.surface):
        return True
    elif re.search(r'^(接尾|非自立)', n.part_of_speech.split(',')[1]):
        return True
    elif u'サ変・スル' == n.infl_form or u'ある' == n.base_form:
        return True
    elif re.search(r'^(名詞|動詞|形容詞)', n.part_of_speech.split(',')[0]):
        return False
    else:
        return True


def not_stopword(n):
    return not is_stopword(n)


def node2word(n):
    return n.surface


def node2norm_word(n):
    if n.base_form != '*':
        return n.base_form
    else:
        return n.surface


def word_segmenter_ja(sent, node_filter=not_stopword, node2word=node2norm_word):
    nodes = tokenizer.tokenize(sent)

    if node_filter:
        nodes = [n for n in nodes if node_filter(n)]
    words = [node2word(n) for n in nodes]

    return words


def sent_splitter_ja(text, delimiters=set('。.?!\n\r'), parenthesis='()「」『』“”'):
    '''
    Args:
      text: string that contains multiple Japanese sentences.
      delimiters: set() of sentence delimiter characters.
      parenthesis: pairs of bracket characters to be checked for correspondence.
    Returns:
      list of sentences.
    '''
    paren_chars = set(parenthesis)
    close2open = dict(zip(parenthesis[1::2], parenthesis[0::2]))

    sentences = []
    pstack = []
    buff = []

    for i, c in enumerate(text):
        c_next = text[i+1] if i+1 < len(text) else None
        # check correspondence of parenthesis
        if c in paren_chars:
            if c in close2open:  # close
                if len(pstack) > 0 and pstack[-1] == close2open[c]:
                    pstack.pop()
            else:  # open
                pstack.append(c)

        buff.append(c)
        if c in delimiters:
            if len(pstack) == 0 and c_next not in delimiters:
                s = ''.join(buff).strip()
                if s:
                    sentences.append(s)
                buff = []
    if len(buff) > 0:
        s = ''.join(buff).strip()
        if s:
            sentences.append(s)

    return sentences


def lexrank(sentences, continuous=False, sim_threshold=0.1, alpha=0.9):
    '''
    compute centrality score of sentences.
    Args:
      sentences: [u'こんにちは.', u'私の名前は飯沼です.', ... ]
      continuous: if True, apply continuous LexRank. (see reference)
      sim_threshold: if continuous is False and similarity is greater or
        equal to sim_threshold, link the sentences.
      alpha: the damping factor of PageRank
    Returns: tuple
      (
        {
          # sentence index -> score
          0: 0.003,
          1: 0.002,
          ...
        },
        similarity_matrix
      )
    Reference:
      Günes Erkan and Dragomir R. Radev.
      LexRank: graph-based lexical centrality as salience in text
      summarization. (section 3)
      http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html
    '''
    # configure ranker
    ranker_params = {'max_iter': 1000}
    ranker = nx.pagerank_scipy
    ranker_params['alpha'] = alpha

    graph = nx.DiGraph()

    # sentence -> tf
    sent_tf_list = []
    for sent in sentences:
        words = word_segmenter_ja(sent)
        tf = collections.Counter(words)
        sent_tf_list.append(tf)

    sent_vectorizer = DictVectorizer(sparse=True)
    sent_vecs = sent_vectorizer.fit_transform(sent_tf_list)

    # compute similarities between sentences
    sim_mat = 1 - pairwise_distances(sent_vecs, sent_vecs, metric='cosine')

    linked_rows, linked_cols = np.where(
        sim_mat > 0 if continuous else sim_mat >= sim_threshold
    )

    # create similarity graph
    graph.add_nodes_from(range(sent_vecs.shape[0]))
    for i, j in zip(linked_rows, linked_cols):
        if i != j:
            weight = sim_mat[i, j] if continuous else 1.0
            graph.add_edge(i, j, weight=weight)

    scores = ranker(graph, **ranker_params)
    return scores, sim_mat


def summarize(sentences, sent_limit=None, char_limit=None, imp_require=None, **lexrank_params):
    '''
    Args:
      sentences: text to be summarized or list of sentences
      sent_limit: summary length (the number of sentences)
      char_limit: summary length (the number of characters)
      imp_require: cumulative LexRank score [0.0-1.0]
    Returns:
      list of extracted sentences
    '''
    
    if isinstance(sentences, str):
        sentences = sent_splitter_ja(sentences)

    scores, sim_mat = lexrank(sentences, **lexrank_params)
    sum_scores = sum(scores.values())
    acc_scores = 0.0
    indexes = set()
    num_sent, num_char = 0, 0
    for i in sorted(scores, key=lambda i: scores[i], reverse=True):
        num_sent += 1
        num_char += len(sentences[i])
        if sent_limit is not None and num_sent > sent_limit:
            break
        if char_limit is not None and num_char > char_limit:
            break
        if imp_require is not None and acc_scores / sum_scores >= imp_require:
            break
        indexes.add(i)
        acc_scores += scores[i]

    if len(indexes) > 0:
        sentences = [sentences[i] for i in sorted(indexes)]
    # Note: turning this into str(sentences) makes the result go weird.
    return sentences

# -------------------------------------------------------------------------------------


def _get_bocchan_text(text):
    # Strip Aozora Bunko style markup so only the body text gets summarized.
    # (The full-width | and [ ] below are assumed to be the Aozora ruby/annotation markers.)
    text = re.sub(r'《[^》]+》', '', text)           # ruby readings
    text = re.sub(r'|', '', text)                   # ruby start markers
    text = re.sub(r'[.+?]', '', text)             # transcriber notes in full-width brackets
    text = re.sub(r'-----[\s\S]*-----', '', text)    # block between ----- delimiter lines
    text = re.split('底本:', text)[0]                # drop the colophon and everything after it

    return text


if __name__ == '__main__':
    """
    text = _get_bocchan_text()

    test_data = codecs.open("writeme.txt", "r", 'utf-8')
    text = test_data.read()

    result = summarize(text, char_limit=400)
    print('\n\nsummarize(char_limit=400)')
    print('\n'.join(result))

    result = summarize(text, sent_limit=10)
    print('\n\nsummarize(sent_limit=10)')
    print('\n'.join(result))

    result = summarize(text, char_limit=400, continuous=True)
    print('\n\nsummarize(char_limit=400, continuous=True)')
    print('\n'.join(result))

    result = summarize(text, sent_limit=10, continuous=True)
    print('\n\nsummarize(sent_limit=10, continuous=True)')
    print('\n'.join(result))
    """

To be honest, I barely understand what's going on inside jabstract.
(I did change it in a few places, though.)
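
As a quick standalone check of the module, something like the following is enough. This is just a minimal sketch: the sample text is the opening of "Wagahai wa Neko de Aru", and it assumes jabstract.py is importable and the dependencies from its docstring (numpy, networkx, scikit-learn, janome) are installed.

from jabstract import summarize

# Any Japanese text will do; this sample is the opening lines of "I Am a Cat".
text = '吾輩は猫である。名前はまだ無い。どこで生れたかとんと見当がつかぬ。'

# Keep the two sentences with the highest LexRank scores.
for s in summarize(text, sent_limit=2):
    print(s)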


my_text.py

import codecs
import urllib.request as req
from urllib.parse import parse_qsl, urlparse

import pandas as pd
from bs4 import BeautifulSoup
from requests import get as GET

# Search Google for a keyword and collect the text of the top hits.

def check(text):
    # text: the search keyword (str).
    # Change num=4 in the URL to change how many result pages are fetched.
    html = GET('https://www.google.co.jp/search?num=4&q=' + text).text
    bs = BeautifulSoup(html, 'lxml')
    # One row per search result: title and URL.
    df = pd.DataFrame(columns=["title", "url"])
    sen = ""
    for el in bs.select("h3.r a"):
        title = el.get_text()
        # Google wraps each result in a redirect link; the real URL is in the q parameter.
        url = dict(parse_qsl(urlparse(el.get("href")).query))["q"]
        print(title)
        print("  ", url)

        new_data = pd.Series([title, url], index=df.columns)
        df = df.append(new_data, ignore_index=True)



        # Fetch the page with urlopen()
        res = req.urlopen(url)
        # Parse it with BeautifulSoup()
        soup = BeautifulSoup(res, 'html.parser')

        # Error handler 'hoge': replace characters the codec cannot handle with a space.
        codecs.register_error('hoge', lambda e: (' ', e.end))
        # Open the output file ('w' overwrites, 'a' appends); since this runs inside the
        # loop, writeme.txt keeps only the last page, while sen accumulates everything.
        f = open('writeme.txt', 'w', encoding='utf-8', errors='hoge')



        # Extract the title and the paragraph text
        h1 = soup.find("h1")
        if h1 is None:
            title1 = "no title"
        else:
            title1 = h1.string
        print("title = ", title1)
        p_list = soup.find_all("p")
        for p in p_list:
            print(p.get_text())
            f.write(p.get_text())
            sen = str(sen) + str(p.get_text())



        f.close()
    print("******************************************************************")
    # Write the collected titles and URLs to CSV (Shift_JIS).
    df.to_csv("output.csv", encoding="shift_jis", header=False, index=False)
    return sen

        

if __name__ == '__main__':
    check("大学生 やるべき")