---
# Conda environment definition: conda packages are listed first, followed by
# a nested list of packages that are installed with pip.
name: llm-langchain-env

channels:
  - conda-forge

dependencies:
  # Interpreter (pinned to an exact release) and the installer used for the
  # nested pip section below.
  - python=3.12.7
  - pip

  - pandas

  # Required for pdf2image (used by unstructured for the hi_res strategy).
  - poppler

  # Tesseract OCR engine.
  - tesseract

  # Provides cosine_similarity and other sklearn functionality.
  - scikit-learn

  - pip:
      - chromadb

      # LangChain packages, each pinned to an exact release.
      - langchain==0.3.20
      - langchain-core==0.3.41
      - langchain-openai==0.3.3
      - langchain-community==0.3.16

      - python-dotenv==1.0.1
      - ipykernel==6.29.4
      - pypdf
      - beautifulsoup4
      - youtube-transcript-api
      - pytube

      # The local-inference extra includes the dependencies for hi_res and
      # table extraction. Quoted because the value contains '[' and ']'.
      - 'unstructured[local-inference]'

      # Python wrapper for Tesseract OCR.
      - pytesseract