François Mentec committed on
Commit
fbdb6be
·
1 Parent(s): e4275fc

basic structure

Browse files
Files changed (7) hide show
  1. .env.example +1 -0
  2. .gitignore +3 -1
  3. Dockerfile +11 -3
  4. requirements.txt +4 -2
  5. src/bon_livraison.py +33 -0
  6. src/streamlit_app.py +61 -35
  7. src/util.py +48 -0
.env.example ADDED
@@ -0,0 +1 @@
 
 
1
+ MISTRAL_API_KEY=your_mistral_API_key_here
.gitignore CHANGED
@@ -1 +1,3 @@
1
- data/documents
 
 
 
1
+ data/documents
2
+ .env
3
+ __pycache__
Dockerfile CHANGED
@@ -1,20 +1,28 @@
1
  FROM python:3.13.5-slim
2
 
 
3
  WORKDIR /app
4
 
5
  RUN apt-get update && apt-get install -y \
6
  build-essential \
7
  curl \
8
  git \
 
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
- COPY requirements.txt ./
12
- COPY src/ ./src/
13
 
14
  RUN pip3 install -r requirements.txt
15
 
16
  EXPOSE 8501
 
 
 
 
 
 
17
 
18
  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
 
20
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
1
  FROM python:3.13.5-slim
2
 
3
+ RUN useradd -m -u 1000 user
4
  WORKDIR /app
5
 
6
  RUN apt-get update && apt-get install -y \
7
  build-essential \
8
  curl \
9
  git \
10
+ poppler-utils \
11
  && rm -rf /var/lib/apt/lists/*
12
 
13
+ COPY --chown=user requirements.txt ./
14
+ COPY --chown=user .env* ./
15
 
16
  RUN pip3 install -r requirements.txt
17
 
18
  EXPOSE 8501
19
+ COPY --chown=user . /app
20
+
21
+ USER user
22
+
23
+ ENV HOME=/home/user \
24
+ PATH=/home/user/.local/bin:$PATH
25
 
26
  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
27
 
28
+ ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.enableXsrfProtection=false"]
requirements.txt CHANGED
@@ -1,3 +1,5 @@
1
- altair
2
  pandas
3
- streamlit
 
 
 
 
 
1
  pandas
2
+ streamlit
3
+ dotenv
4
+ mistralai
5
+ pdf2image
src/bon_livraison.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
SYSTEM_PROMPT = """
Extract data in json format
"""

# Mistral Medium 3.1; used when the caller does not choose a model.
DEFAULT_MODEL = "mistral-medium-2508"


def extract_from_bl(client, document_source, model=DEFAULT_MODEL):
    """Ask a chat model to extract a document's data as a JSON object.

    Args:
        client: Object exposing ``chat.complete(model=..., messages=...,
            response_format=...)``; the app passes a ``mistralai.Mistral``
            instance.
        document_source: Content chunk describing the document. It is placed
            unchanged after the text instruction in the user message.
        model: Model identifier forwarded to the API. Defaults to
            ``DEFAULT_MODEL`` so existing two-argument callers keep the
            previous behaviour.

    Returns:
        The ``content`` of the first choice's message, returned as-is
        (not parsed here).
    """
    messages = [
        {
            "role": "system",
            "content": SYSTEM_PROMPT,
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Extract data from this document",
                },
                document_source,
            ],
        },
    ]

    chat_response = client.chat.complete(
        model=model,
        messages=messages,
        # Request JSON mode so the reply is a JSON object.
        response_format={"type": "json_object"},
    )

    return chat_response.choices[0].message.content
src/streamlit_app.py CHANGED
@@ -2,39 +2,65 @@ import altair as alt
2
  import numpy as np
3
  import pandas as pd
4
  import streamlit as st
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import numpy as np
3
  import pandas as pd
4
  import streamlit as st
5
+ from dotenv import load_dotenv
6
+ import os
7
+ from mistralai import Mistral
8
 
9
+ from util import display_pdf, upload_pdf
10
+ from bon_livraison import extract_from_bl
11
+
12
load_dotenv()

MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")


def main():
    """Run the Streamlit app.

    Flow: obtain a Mistral API key (environment first, sidebar input
    otherwise), let the user upload a PDF, preview it, upload it to Mistral
    to obtain a signed URL, and on button click display the JSON returned by
    ``extract_from_bl``.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import list.
    import hashlib

    # Sidebar: only ask for a key when none was found in the environment.
    api_key = MISTRAL_API_KEY or st.sidebar.text_input(
        "Mistral API Key", type="password"
    )

    if not api_key:
        st.warning("Enter API key to continue")
        return

    # Initialize Mistral API client
    client = Mistral(api_key=api_key)

    uploaded_file = st.file_uploader("Choisissez un PDF", type=["pdf"])
    if not uploaded_file:
        return

    # getvalue() returns the full buffer regardless of the stream position,
    # whereas read() returns nothing once the stream has been consumed.
    content = uploaded_file.getvalue()

    # Display the uploaded PDF
    display_pdf(content)

    # Streamlit re-executes the script on every interaction (including the
    # button click below), so remember the signed URL for the current file
    # instead of uploading the same bytes again on each rerun.
    cache_key = (uploaded_file.name, hashlib.sha256(content).hexdigest())
    cached = st.session_state.get("uploaded_document")
    if cached is None or cached["key"] != cache_key:
        try:
            document_url = upload_pdf(client, content, uploaded_file.name)
        except Exception as e:
            # Report upload failures in the UI rather than as a raw traceback.
            st.error(f"Processing error: {str(e)}")
            return
        cached = {"key": cache_key, "url": document_url}
        st.session_state["uploaded_document"] = cached

    # Prepare document source for processing
    document_source = {
        "type": "document_url",
        "document_url": cached["url"],
    }

    if st.button("Générer les données au format JSON"):
        # Process the document when the user clicks the button
        with st.spinner("Extracting JSON content..."):
            try:
                response = extract_from_bl(client, document_source)

                with st.expander("Response"):
                    st.json(response)

            except Exception as e:
                # Display an error message if processing fails
                st.error(f"Processing error: {str(e)}")


if __name__ == "__main__":
    main()
src/util.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pdf2image import convert_from_bytes
2
+ import streamlit as st
3
+ import os
4
+ import tempfile
5
+
6
def display_pdf(content: bytes):
    """Render each page of a PDF as an image in the Streamlit page.

    Args:
        content: Raw bytes of the PDF, passed to ``convert_from_bytes``.

    If anything goes wrong, an error message is shown and the PDF is offered
    as a download instead.
    """
    try:
        pages = convert_from_bytes(content)
        for page_number, page in enumerate(pages, start=1):
            st.image(
                page,
                caption=f"Page {page_number}",
                use_container_width=True,
            )
    except Exception as e:
        st.error(f"Impossible d'afficher le PDF : {e}")
        # NOTE(review): the source listing lost its indentation; the download
        # button is assumed to be part of this fallback branch — confirm.
        st.download_button(
            label="📥 Télécharger le PDF",
            data=content,
            file_name="document.pdf",
            mime="application/pdf",
        )
19
+
20
def upload_pdf(client, content, filename):
    """
    Uploads a PDF to Mistral's API and retrieves a signed URL for processing.

    The bytes are handed to the client directly. No temporary file is
    created, so the user-supplied ``filename`` is never used as a filesystem
    path (a name containing path separators or an absolute path can neither
    fail the upload nor point outside a temp directory).

    Args:
        client (Mistral): Mistral API client instance; must expose
            ``files.upload`` and ``files.get_signed_url``.
        content (bytes): The content of the PDF file.
        filename (str): The name of the PDF file, sent as ``file_name``.

    Returns:
        str: Signed URL for the uploaded PDF.
    """
    file_upload = client.files.upload(
        file={"file_name": filename, "content": content},
        purpose="ocr",
    )

    signed_url = client.files.get_signed_url(file_id=file_upload.id)
    return signed_url.url