!pip install wikipedia transformers sentence_transformers faiss-cpu -q
import wikipedia
from wikipedia.exceptions import DisambiguationError
from transformers import pipeline
def divide_chunks(l, n):
    """Lazily split a sliceable sequence into consecutive pieces.

    Args:
        l: Any object supporting ``len()`` and slicing (list, str, ...).
        n: Distance between piece starts, i.e. the maximum piece size.

    Yields:
        ``l[start:start + n]`` for ``start`` = 0, n, 2n, ...; the final piece
        may hold fewer than ``n`` items.
    """
    total = len(l)
    yield from (l[start:start + n] for start in range(0, total, n))

def get_passages(text, k=100):
    """Cut ``text`` into passages of at most ``k`` space-separated tokens.

    Args:
        text: Source string; it is split on single space characters only, so
            other whitespace (e.g. newlines) stays inside the tokens.
        k: Maximum number of tokens per passage.

    Returns:
        A list of strings, each the space-joined tokens of one chunk, in
        their original order.
    """
    words = text.split(" ")
    return [" ".join(group) for group in divide_chunks(words, k)]

def get_passage_for_question(question, wiki_hits=3, passage_len=100, debug=False):
  """Collect candidate passages for a question from Wikipedia search results.

  Args:
    question: Free-text question, used verbatim as the Wikipedia search query.
    wiki_hits: Maximum number of search results (page titles) to fetch.
    passage_len: Maximum number of space-separated tokens per passage.
    debug: When true, print the list of page titles returned by the search.

  Returns:
    A flat list of passage strings from every page that could be loaded, in
    search-result order. Empty when the search yields nothing usable.
  """
  # NOTE(review): the search/page calls and the try/continue/extend lines were
  # missing from the extracted source and are reconstructed here - confirm.
  top_hits = wikipedia.search(question, results=wiki_hits)
  if debug:
    print("Top Wiki hits :", top_hits)
  passages = []
  for hit in top_hits:
    try:
      # auto_suggest=False: load exactly the title the search returned.
      html_page = wikipedia.page(title=hit, auto_suggest=False)
    except DisambiguationError:
      # The title is a disambiguation page with no single article; skip it.
      continue
    passages.extend(get_passages(html_page.content, k=passage_len))

  return passages
# Extractive question-answering pipeline; get_answer_dpr calls it with a list
# of questions and a list of contexts to pull answer spans out of passages.
qa = pipeline("question-answering", model="ankur310794/roberta-base-squad2-nq")
from transformers import TFAutoModel, AutoTokenizer
def combine_results(passages, k=4):
  """Merge consecutive passages into larger contexts.

  Args:
    passages: List of passage strings.
    k: Maximum number of passages merged into one context.

  Returns:
    A list of strings, each the space-joined text of up to ``k`` consecutive
    passages, in the input order.
  """
  return [" ".join(group) for group in divide_chunks(passages, k)]
# Two separate encoders: one checkpoint for passages ("ctx_encoder") and one
# for questions ("question_encoder").
# NOTE(review): the "H-128" in the checkpoint names presumably is the embedding
# width matching the 128-d FAISS index built in get_answer_dpr - confirm.
passage_encoder = TFAutoModel.from_pretrained("nlpconnect/dpr-ctx_encoder_bert_uncased_L-2_H-128_A-2")
query_encoder = TFAutoModel.from_pretrained("nlpconnect/dpr-question_encoder_bert_uncased_L-2_H-128_A-2")

# Each tokenizer is loaded from the same checkpoint as its encoder.
p_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/dpr-ctx_encoder_bert_uncased_L-2_H-128_A-2")
q_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/dpr-question_encoder_bert_uncased_L-2_H-128_A-2")

import numpy as np
def extracted_passage_embeddings(processed_passages, max_length=156):
    """Encode passages with the passage (context) encoder.

    Args:
        processed_passages: List of passage strings.
        max_length: Token length every passage is padded/truncated to.

    Returns:
        Whatever ``passage_encoder.predict`` returns for the batch; callers
        read the pooled sentence vectors from its ``pooler_output`` entry.
    """
    # NOTE(review): the tokenizer arguments and the tail of the predict() call
    # were truncated in the extracted source and are reconstructed - confirm.
    passage_inputs = p_tokenizer.batch_encode_plus(
        processed_passages,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",  # fixed width so the batch forms one array
        max_length=max_length,
        return_token_type_ids=True,
    )
    # Positional list order expected by the TF model:
    # input_ids, attention_mask, token_type_ids.
    passage_embeddings = passage_encoder.predict([
        np.array(passage_inputs['input_ids']),
        np.array(passage_inputs['attention_mask']),
        np.array(passage_inputs['token_type_ids']),
    ])
    return passage_embeddings

def extracted_query_embeddings(queries, max_length=64):
    """Encode questions with the question encoder.

    Args:
        queries: List of question strings.
        max_length: Token length every question is padded/truncated to.

    Returns:
        Whatever ``query_encoder.predict`` returns for the batch; callers
        read the pooled sentence vectors from its ``pooler_output`` entry.
    """
    # NOTE(review): the tokenizer arguments and the tail of the predict() call
    # were truncated in the extracted source and are reconstructed - confirm.
    query_inputs = q_tokenizer.batch_encode_plus(
        queries,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",  # fixed width so the batch forms one array
        max_length=max_length,
        return_token_type_ids=True,
    )
    # Positional list order expected by the TF model:
    # input_ids, attention_mask, token_type_ids.
    query_embeddings = query_encoder.predict([
        np.array(query_inputs['input_ids']),
        np.array(query_inputs['attention_mask']),
        np.array(query_inputs['token_type_ids']),
    ])
    return query_embeddings
import faiss
import spacy
# English pipeline used only for sentence segmentation in get_answer_full_sent.
# The bare "en" shortcut link no longer resolves in spaCy 3+, so load the
# model by its full package name (install it with
# `python -m spacy download en_core_web_sm` if it is missing).
nlp = spacy.load("en_core_web_sm")
def get_answer_full_sent(m_passages, answer_dict):
  """Attach the full sentence that contains an extracted answer.

  Args:
    m_passages: The context string the answer was extracted from.
    answer_dict: Answer record holding at least ``'start'``, the character
      offset of the answer within ``m_passages``. It is updated in place.

  Returns:
    The same ``answer_dict`` with an ``'answer_sentence'`` key: the text of
    the sentence in which the answer starts, or ``""`` when the offset lies
    beyond the last sentence.
  """
  answer_start = answer_dict['start']
  # Always set the key so later column selection cannot fail on it.
  answer_dict['answer_sentence'] = ""
  for sent in nlp(m_passages).sents:
    # Sentences arrive in document order, so the first one ending after the
    # answer's start offset is the one containing it. Using the span's own
    # character offsets keeps alignment with the original text (re-joining
    # sentence texts drops the whitespace between them).
    if answer_start < sent.end_char:
      answer_dict['answer_sentence'] = sent.text
      break
  return answer_dict
from sentence_transformers import CrossEncoder
# Cross-encoder used by get_reranked_passage to score (question, passage) pairs.
ranking_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=196)
def get_reranked_passage(passages, question, top_rr):
  """Keep the passages the cross-encoder scores highest for the question.

  Args:
    passages: Candidate passage strings.
    question: The question each passage is scored against.
    top_rr: Maximum number of passages to keep.

  Returns:
    Up to ``top_rr`` passages ordered from highest to lowest score. Fewer are
    returned when there are fewer candidates; an empty list when there are
    no candidates or ``top_rr`` is not positive.
  """
  if not passages or top_rr <= 0:
    return []
  passage_question_pair = [(question, p) for p in passages]
  scores = np.asarray(ranking_model.predict(passage_question_pair))
  # A full descending sort (rather than argpartition) gives a real ranking
  # and stays valid when top_rr exceeds the number of candidates.
  best_first = np.argsort(scores)[::-1][:top_rr]
  return [passages[i] for i in best_first]
# end to end with dpr
import pandas as pd

def get_answer_dpr(question, topk_r=30, topk_rr=8):
  """Answer a question end to end: retrieve, rerank, merge, then extract.

  Steps: fetch Wikipedia passages, embed passages and question, retrieve the
  nearest passages with a FAISS L2 index, rerank them with the cross-encoder,
  merge the survivors into larger contexts and run the QA pipeline on each.

  Args:
    question: Free-text question.
    topk_r: Number of passages to retrieve from the dense index.
    topk_rr: Number of retrieved passages kept after reranking.

  Returns:
    A pandas DataFrame with columns ``answer``, ``answer_sentence`` and
    ``score``, sorted by ``score`` descending. Empty when no passage could
    be fetched for the question.
  """
  answer_columns = ['answer', 'answer_sentence', 'score']
  passages = get_passage_for_question(question, debug=True)
  print("Total passages: ", len(passages))
  if not passages:
    # Nothing to index or read; return the usual shape with no rows.
    return pd.DataFrame(columns=answer_columns)
  passage_embeddings = extracted_passage_embeddings(passages)
  query_embeddings = extracted_query_embeddings([question])
  # NOTE(review): index population and the search call were garbled in the
  # extracted source; reconstructed on the encoders' pooled outputs - confirm.
  # FAISS expects contiguous float32 matrices.
  passage_vectors = np.ascontiguousarray(passage_embeddings['pooler_output'], dtype='float32')
  query_vectors = np.ascontiguousarray(query_embeddings['pooler_output'], dtype='float32')
  # Size the index from the data instead of hard-coding the width.
  faiss_index = faiss.IndexFlatL2(passage_vectors.shape[1])
  faiss_index.add(passage_vectors)
  # FAISS pads missing neighbours with -1, which would silently index the
  # last passage; never ask for more than exist and drop any padding.
  k = min(topk_r, len(passages))
  _distances, index = faiss_index.search(query_vectors, k=k)
  r_passages = [passages[i] for i in index[0] if i >= 0]
  print("Top k retrieved passages :", len(r_passages))
  rr_passages = get_reranked_passage(r_passages, question, topk_rr)
  print("Top k reranked passages :", len(rr_passages))
  m_passages = combine_results(rr_passages)
  print("Merged passages :", len(m_passages))
  results = qa(question=[question]*len(m_passages), context=m_passages, max_seq_len=512)
  # The pipeline returns a bare dict when given a single context.
  if isinstance(results, dict):
    results = [results]
  output_results = [get_answer_full_sent(context, result) for context, result in zip(m_passages, results)]
  # reindex tolerates a record lacking one of the columns (filled with NaN).
  return pd.DataFrame(output_results).reindex(columns=answer_columns).sort_values("score", ascending=False)
# Example run of the full pipeline on one question.
results= get_answer_dpr("where was tara located in gone with the wind?")
# Output:
# Top Wiki hits : ['Tara (plantation)', 'Margaret Mitchell', 'RKO Forty Acres']
# Total passages:  95
# Top k retrieved passages : 30
# Top k reranked passages : 8
# Merged passages : 2
#
#    answer              answer_sentence                                     score
# 0  Talmadge Farms      Now the Tara facade is still located at Talmad...  0.740677
# 1  virtually the same  In the 2007 novel by Donald McCaig, Rhett Butl...  0.159294
# Display the answers ordered by score, highest first (the expression's value
# is only shown when run as the last statement of a notebook cell).
results.sort_values("score", ascending=False)
# Output:
#    answer              answer_sentence                                     score
# 0  Talmadge Farms      Now the Tara facade is still located at Talmad...  0.740677
# 1  virtually the same  In the 2007 novel by Donald McCaig, Rhett Butl...  0.159294
!pip install gradio -q
import gradio as gr
# Minimal web UI: one question textbox in, a table of ranked answers out.
# The `gr.inputs` / `gr.outputs` namespaces were removed from current Gradio
# releases (which the unpinned install above fetches); components now live at
# the top level and the initial text is passed as `value` instead of `default`.
inp = gr.Textbox(lines=2, value='what is coronavirus?', label="Question")
out = gr.Dataframe(label="Answers")
gr.Interface(fn=get_answer_dpr, inputs=inp, outputs=out).launch()
# Output (truncated):
# Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
# (<Flask 'gradio.networking'>,