https://docs.pinecone.io/integrations/llamaindex

https://docs.llamaindex.ai/en/stable/module_guides/loading/ingestion_pipeline/ // import 부분 참고

Set up your environment

# Install libraries (versions pinned to the ones this walkthrough was written against)
pip install llama-index==0.10.36
pip install "pinecone-client[grpc]"==3.0.0
pip install arxiv==2.1.0
pip install setuptools==69.0.3  # (Optional)

# Set environment variables for API keys.
# Replace each placeholder (including the angle brackets) with the real key.
# The values are quoted so the shell does not treat `<` / `>` as redirections.
export PINECONE_API_KEY="<your Pinecone API key available at app.pinecone.io>"
export OPENAI_API_KEY="<your OpenAI API key, available at platform.openai.com/api-keys>"
# Read the API keys back from the environment (Python side).
# `os.environ.get` returns None when a variable is not set.
import os

pinecone_api_key = os.environ.get('PINECONE_API_KEY')
openai_api_key = os.environ.get('OPENAI_API_KEY')

Load the data

from pathlib import Path

import arxiv
# llama-index 0.10+ ships readers as namespaced packages; the old
# `from llama_index import download_loader` path no longer resolves.
from llama_index.readers.file import PDFReader

# Download paper to local file system (LFS)
# `id_list` contains 1 item that matches our PDF's arXiv ID
paper = next(arxiv.Client().results(arxiv.Search(id_list=["1603.09320"])))
paper.download_pdf(filename="hnsw.pdf")

# Instantiate the PDF reader
loader = PDFReader()

# Load HNSW PDF from LFS
documents = loader.load_data(file=Path('./hnsw.pdf'))

# Preview one of our documents
documents[0]
>>> Document(id_='e25106d2-bde5-41f0-83fa-5cbfa8234bef', embedding=None, metadata={'page_label': '1', 'file_name': 'hnsw.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text="IEEE TRANSACTIONS ON  JOURNAL NAME,  MANUS CRIPT ID  1 \\n Efficient and robust approximate nearest \\nneighbor search using Hierarchical Navigable \\nSmall World graphs  \\nYu. A. Malkov,  D. A. Yashunin  \\nAbstract  — We present a new approach for the approximate K -nearest neighbor search based on navigable small world \\ngraphs with controllable hierarchy (Hierarchical NSW , HNSW ) and tree alg o-\\nrithms", start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')

PDF에서 가져온 내용 중 필요 없는 부분을 제거합니다.

# Clean up our Documents' content
import re

def clean_up_text(content: str) -> str:
    """
    Remove unwanted characters and patterns in text input.

    The steps run in order: re-join words that were hyphenated across a line
    break, delete newlines / decorative dash runs / escaped-unicode residue /
    two private-use code points, tighten spacing around hyphens, and finally
    collapse every whitespace run to a single space. Leading and trailing
    whitespace is collapsed but not stripped.

    :param content: Text input.

    :return: Cleaned version of original text input.
    """

    # Fix hyphenated words broken by newline ("approxi-\nmate" -> "approximate")
    content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)

    # Remove specific unwanted patterns and characters.
    # Order matters for the dash runs: longer runs are removed before shorter
    # ones. Newlines are deleted outright (not replaced by a space).
    unwanted_patterns = [
        r'\n', "  —", "——————————", "—————————", "—————",
        r'\\u[\dA-Fa-f]{4}',  # literal "\uXXXX" escape text left in the content
        r'\uf075', r'\uf0b7',  # private-use code points U+F075 / U+F0B7
    ]
    for pattern in unwanted_patterns:
        content = re.sub(pattern, "", content)

    # Fix improperly spaced hyphenated words and normalize whitespace
    content = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', content)
    content = re.sub(r'\s+', ' ', content)

    return content

# Call function
# Clean every document's text in place and collect the (same) objects
cleaned_docs = []
for doc in documents:
    doc.text = clean_up_text(doc.text)
    cleaned_docs.append(doc)

# Inspect output
cleaned_docs[0].get_content()
>>> "IEEE TRANSACTIONS ON JOURNAL NAME, MANUS CRIPT ID 1 Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs Yu. A. Malkov, D. A. Yashunin Abstract We present a new approach for the approximate K-nearest neighbor search based on navigable small world graphs with controllable hierarchy (Hierarchical NSW , HNSW ) and tree algorithms."

# Great!

Transform the data

정리된 Document 객체를 살펴보면, 메타데이터 딕셔너리의 기본 값들이 그다지 유용하지 않다는 것을 알 수 있습니다.