https://docs.pinecone.io/integrations/llamaindex
https://docs.llamaindex.ai/en/stable/module_guides/loading/ingestion_pipeline/ // import 부분 참고
# Install libraries
# Versions are pinned with `==` so the walkthrough below is reproducible.
pip install llama-index==0.10.36
pip install "pinecone-client[grpc]"==3.0.0
pip install arxiv==2.1.0
pip install setuptools==69.0.3 # (Optional)
# Set environment variables for API keys
# Replace each <...> placeholder, angle brackets included, with the real key
# before running; left as-is, the shell parses the unquoted `<` as an input
# redirection instead of assigning a value.
export PINECONE_API_KEY=<your Pinecone API key available at app.pinecone.io>
export OPENAI_API_KEY=<your OpenAI API key, available at platform.openai.com/api-keys>
import os

# Read the API keys from the environment variables.
# `os.environ.get` returns None (rather than raising) when a variable is unset.
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
openai_api_key = os.environ.get('OPENAI_API_KEY')
import arxiv
from pathlib import Path
from llama_index import download_loader
# Fetch the paper from arXiv and save it to the local file system (LFS).
# `id_list` holds exactly one arXiv ID, so the first result is taken.
search = arxiv.Search(id_list=["1603.09320"])
results = arxiv.Client().results(search)
paper = next(results)
paper.download_pdf(filename="hnsw.pdf")

# Obtain the `PDFReader` class through `download_loader` and instantiate it
PDFReader = download_loader("PDFReader")
loader = PDFReader()

# Read the saved HNSW PDF back from the LFS
pdf_path = Path('./hnsw.pdf')
documents = loader.load_data(file=pdf_path)

# Preview the first loaded document
documents[0]
>>> Document(id_='e25106d2-bde5-41f0-83fa-5cbfa8234bef', embedding=None, metadata={'page_label': '1', 'file_name': 'hnsw.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text="IEEE TRANSACTIONS ON JOURNAL NAME, MANUS CRIPT ID 1 \\n Efficient and robust approximate nearest \\nneighbor search using Hierarchical Navigable \\nSmall World graphs \\nYu. A. Malkov, D. A. Yashunin \\nAbstract — We present a new approach for the approximate K -nearest neighbor search based on navigable small world \\ngraphs with controllable hierarchy (Hierarchical NSW , HNSW ) and tree alg o-\\nrithms", start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')
PDF에서 가져온 내용 중 필요 없는 부분을 제거하는 단계입니다.
# Clean up our Documents' content
import re
def clean_up_text(content: str) -> str:
    """
    Remove unwanted characters and patterns in text input.

    The steps run in a fixed order: rejoin words hyphenated across a line
    break, delete a list of unwanted patterns, tighten spacing around
    hyphens, and finally collapse every whitespace run to a single space.

    :param content: Text input.
    :return: Cleaned version of original text input.
    """
    # Fix hyphenated words broken by newline ("alg o-\nrithms" -> "alg orithms")
    content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)

    # Remove specific unwanted patterns and characters. Every entry is used
    # as a regular expression and every match is deleted outright:
    #   r'\n'                 real newline characters (deleted, not replaced
    #                         by a space)
    #   " —" and dash runs    em-dash separators
    #   r'\\u[\dA-Fa-f]{4}'   a literal backslash-u escape left in the text
    #   r'\uf075', r'\uf0b7'  the characters U+F075 and U+F0B7
    unwanted_patterns = [
        r'\n', " —", "——————————", "—————————", "—————",
        r'\\u[\dA-Fa-f]{4}', r'\uf075', r'\uf0b7',
    ]
    for pattern in unwanted_patterns:
        content = re.sub(pattern, "", content)

    # Fix improperly spaced hyphenated words and normalize whitespace
    content = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', content)
    content = re.sub(r'\s+', ' ', content)

    return content
# Clean each document's text in place and collect the documents.
# `cleaned_docs` holds the same objects as `documents`, with `text` rewritten.
cleaned_docs = []
for doc in documents:
    doc.text = clean_up_text(doc.text)
    cleaned_docs.append(doc)

# Inspect output
cleaned_docs[0].get_content()
>>> "IEEE TRANSACTIONS ON JOURNAL NAME, MANUS CRIPT ID 1 Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs Yu. A. Malkov, D. A. Yashunin Abstract We present a new approach for the approximate K-nearest neighbor search based on navigable small world graphs with controllable hierarchy (Hierarchical NSW , HNSW ) and tree algorithms."
# Great!
정리된 Document 객체를 살펴보면, 메타데이터 딕셔너리의 기본값들이 그다지 유용하지 않을 수 있다는 점을 알 수 있습니다.