thiagohersan committed on
Commit
afb7934
·
verified ·
1 Parent(s): c5e27c9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -8
app.py CHANGED
@@ -3,24 +3,47 @@ import numpy as np
3
 
4
  from transformers import pipeline
5
 
6
- tts = pipeline(task="text-to-speech", model="facebook/mms-tts-eng")
7
-
8
  # caption = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
9
  caption = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
10
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def run_tts(txt):
12
  res = tts(txt)
13
  audio = (res["audio"].reshape(-1) * 2 ** 15).astype(np.int16)
14
  return res["sampling_rate"], audio
15
 
16
- def run_caption(img):
17
- res = caption(img, max_new_tokens=128)
18
- return res[0]["generated_text"]
19
 
20
  def run_caption_tts(img):
21
  return run_tts(run_caption(img))
22
 
 
 
 
 
23
  with gr.Blocks() as demo:
 
 
 
 
 
 
 
 
 
 
 
 
24
  gr.Interface(
25
  run_tts,
26
  inputs=gr.Textbox(),
@@ -28,13 +51,13 @@ with gr.Blocks() as demo:
28
  )
29
 
30
  gr.Interface(
31
- run_caption,
32
  inputs=gr.Image(type="pil"),
33
- outputs="text",
34
  )
35
 
36
  gr.Interface(
37
- run_caption_tts,
38
  inputs=gr.Image(type="pil"),
39
  outputs="audio",
40
  )
 
3
 
4
  from transformers import pipeline
5
 
 
 
6
# Model pipelines, loaded once at module import (each download/load is slow).
# Alternative captioner kept for reference; BLIP-large is higher quality but heavier:
# caption = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
caption = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

# Text continuation used to expand a short caption into a longer passage.
generate = pipeline("text-generation", model="openai-community/gpt2-xl")

# Speech synthesis; returns a dict with "audio" (float waveform) and "sampling_rate".
tts = pipeline(task="text-to-speech", model="facebook/mms-tts-eng")
12
+
13
def run_caption(img):
    """Return a text caption for *img* via the image-to-text pipeline."""
    predictions = caption(img, max_new_tokens=128)
    return predictions[0]["generated_text"]
16
+
17
def run_generate(txt):
    """Continue *txt* with the GPT-2 text-generation pipeline (max_length=50)."""
    completions = generate(txt, max_length=50)
    return completions[0]["generated_text"]
20
+
21
def run_tts(txt):
    """Synthesize speech for *txt*.

    Returns a ``(sampling_rate, audio)`` tuple suitable for a Gradio audio
    output, where ``audio`` is a 1-D ``np.int16`` array.
    """
    res = tts(txt)
    samples = res["audio"].reshape(-1)
    # Clip to [-1, 1] before scaling: the original `* 2**15` then int16 cast
    # wraps any sample >= 1.0 to -32768 (integer overflow), causing clicks.
    audio = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
    return res["sampling_rate"], audio
25
 
 
 
 
26
 
27
def run_caption_tts(img):
    """Caption *img* and speak the caption; returns (sampling_rate, audio)."""
    description = run_caption(img)
    return run_tts(description)
29
 
30
def run_caption_generate_tts(img):
    """Caption *img*, extend the caption with text generation, then speak it."""
    seed_text = run_caption(img)
    story = run_generate(seed_text)
    return run_tts(story)
32
+
33
+
34
  with gr.Blocks() as demo:
35
+ gr.Interface(
36
+ run_caption,
37
+ inputs=gr.Image(type="pil"),
38
+ outputs="text",
39
+ )
40
+
41
+ gr.Interface(
42
+ run_generate,
43
+ inputs="text",
44
+ outputs="text",
45
+ )
46
+
47
  gr.Interface(
48
  run_tts,
49
  inputs=gr.Textbox(),
 
51
  )
52
 
53
  gr.Interface(
54
+ run_caption_tts,
55
  inputs=gr.Image(type="pil"),
56
+ outputs="audio",
57
  )
58
 
59
  gr.Interface(
60
+ run_caption_generate_tts,
61
  inputs=gr.Image(type="pil"),
62
  outputs="audio",
63
  )