Yinuo Zhang committed on
Commit d65f3a2 · 1 Parent(s): d79b4f8

upload data

Files changed (47)
  1. __pycache__/dataset.cpython-310.pyc +0 -0
  2. __pycache__/main.cpython-310.pyc +0 -0
  3. __pycache__/main.cpython-313.pyc +0 -0
  4. data/smiles/11M_smiles_old_tokenizer_no_limit/dataset_dict.json +1 -0
  5. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00000-of-00024.arrow +3 -0
  6. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00001-of-00024.arrow +3 -0
  7. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00002-of-00024.arrow +3 -0
  8. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00003-of-00024.arrow +3 -0
  9. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00004-of-00024.arrow +3 -0
  10. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00005-of-00024.arrow +3 -0
  11. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00006-of-00024.arrow +3 -0
  12. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00007-of-00024.arrow +3 -0
  13. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00008-of-00024.arrow +3 -0
  14. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00009-of-00024.arrow +3 -0
  15. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00010-of-00024.arrow +3 -0
  16. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00011-of-00024.arrow +3 -0
  17. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00012-of-00024.arrow +3 -0
  18. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00013-of-00024.arrow +3 -0
  19. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00014-of-00024.arrow +3 -0
  20. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00015-of-00024.arrow +3 -0
  21. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00016-of-00024.arrow +3 -0
  22. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00017-of-00024.arrow +3 -0
  23. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00018-of-00024.arrow +3 -0
  24. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00019-of-00024.arrow +3 -0
  25. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00020-of-00024.arrow +3 -0
  26. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00021-of-00024.arrow +3 -0
  27. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00022-of-00024.arrow +3 -0
  28. data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00023-of-00024.arrow +3 -0
  29. data/smiles/11M_smiles_old_tokenizer_no_limit/train/dataset_info.json +35 -0
  30. data/smiles/11M_smiles_old_tokenizer_no_limit/train/state.json +82 -0
  31. data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00000-of-00012.arrow +3 -0
  32. data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00001-of-00012.arrow +3 -0
  33. data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00002-of-00012.arrow +3 -0
  34. data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00003-of-00012.arrow +3 -0
  35. data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00004-of-00012.arrow +3 -0
  36. data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00005-of-00012.arrow +3 -0
  37. data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00006-of-00012.arrow +3 -0
  38. data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00007-of-00012.arrow +3 -0
  39. data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00008-of-00012.arrow +3 -0
  40. data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00009-of-00012.arrow +3 -0
  41. data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00010-of-00012.arrow +3 -0
  42. data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00011-of-00012.arrow +3 -0
  43. data/smiles/11M_smiles_old_tokenizer_no_limit/val/dataset_info.json +35 -0
  44. data/smiles/11M_smiles_old_tokenizer_no_limit/val/state.json +46 -0
  45. dataset.py +1 -1
  46. main.py +166 -165
  47. utils/.ipynb_checkpoints/app-checkpoint.py +1255 -0
__pycache__/dataset.cpython-310.pyc ADDED
Binary file (6.26 kB)
 
__pycache__/main.cpython-310.pyc ADDED
Binary file (7.41 kB)
 
__pycache__/main.cpython-313.pyc ADDED
Binary file (13.1 kB)
 
data/smiles/11M_smiles_old_tokenizer_no_limit/dataset_dict.json ADDED
@@ -0,0 +1 @@
+ {"splits": ["train", "val"]}
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00000-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b2871015033b5ee02228dbaa80ad9939a7902b062b9b10bdc6bc27f69fe8cd2e
+ size 477588328
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00001-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:286ae4a066882217ff95c91ca5315427ce1a9a8e0a11e72a9646beb9d8d286b0
+ size 464191456
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00002-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a88f73782ad97f90ea04198cdf1d9f1d685d812026f298edaea9ce9beca00570
+ size 479308880
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00003-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd3615d74a0d36c86df8d6056d7203ba8a6caabf5c09a83192c6712bb4bd75d1
+ size 492330784
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00004-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a7bb43f10e018b88dbbc8148cf8575cc31ec87cd05b7c7c66fc3ffa70d23a298
+ size 491461384
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00005-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:473af91a61fdbb15570789c55cb85e3425f505e4895448a1babcd68abfeacacf
+ size 490155080
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00006-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be40dce89b1ecd5b5476d491f252e4852c9a9369e48162b4ef7797c13f8508f5
+ size 489402848
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00007-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:994f608ee9089d45f43c3e577afc73597855c98e42436621f5b8a9f4d1959fd4
+ size 488308088
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00008-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:13f1a9ef28679b02cdf49b5fb26401b6a6fd819ce26b2d6ef4c9cc8486b5e5a8
+ size 488565816
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00009-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:42fab66a9a83a5b7ab14f4d01949e5591168d3bdc37ab76f3abcda043b63c0b8
+ size 487136800
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00010-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:09b5a103a3d305405890db95b82de8624cf57944a12bb7fe3b24644af3ef96bb
+ size 487375280
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00011-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95cec0ffca845fb12bcd511f876f98dfd8ef8420a9e0ec030547ed6bde47210a
+ size 486001120
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00012-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:37149287fd5f4e70df2d00dc54b0db0c5f4e590b449b69b3553ebb00ff8ae208
+ size 485408104
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00013-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a16996dc09ccca1e1c3e9063e4dadb3b6329ad4081f35454bd261c0de774a911
+ size 485633824
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00014-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:77f4beaefc1a08ae233e2ab5644e103af3f0c11284b0b49553d1c11c884ef69f
+ size 485841176
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00015-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:27d38dbf2c523723e5f3e29416ebca7a70b0002124372e2d40796bbaec966456
+ size 484020296
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00016-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0931dd1c981afabcc52e6ace64b986b7683e244be585cf5598c7cbe3165308b
+ size 484883888
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00017-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2e0002265365f32697aa9b970443b6a4b4bdbf245fdafc30f4cececb6a7d40ee
+ size 483414096
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00018-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c9ed92e0b1c0b107eb1386dd6c498ee8df15b42a079bb80650b26f2de3b75475
+ size 481238056
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00019-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d61618770e6ebad69b32e7d3b0b93a94cf2ea9cbc467cedf1fcccb1dbad6c2c
+ size 481634360
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00020-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:996aeae99180d8d4c3e3c1db18bfc12f37707d47d584e68255edb71e9ebc99da
+ size 483933744
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00021-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:37561a782f79cf1cce57a63cd6a429cf0cebba33f6c4339d2e75f4e63da00217
+ size 481501560
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00022-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fc4e8f69b678826007870368b3bd9b4a46f4f055aaeabfecb6dc4ca946a9a0e6
+ size 478094088
data/smiles/11M_smiles_old_tokenizer_no_limit/train/data-00023-of-00024.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e8ce766667969f716e2145501823d3d2ec22e3135297b24d0fa33589acf778d
+ size 478855336
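Each .arrow entry above is a Git LFS pointer (version/oid/size), not the Arrow data itself, so the shards must be fetched through LFS (e.g. git lfs pull) before the dataset is usable. A minimal sketch of loading the saved DatasetDict once the files are materialized locally; the relative path assumes the repo checkout root as working directory:

# Minimal sketch: load the saved `datasets` folder uploaded in this commit.
# Assumes the LFS-backed .arrow shards have been pulled to disk first.
from datasets import load_from_disk

ds = load_from_disk("data/smiles/11M_smiles_old_tokenizer_no_limit")
print(ds)                    # DatasetDict with "train" and "val" splits
print(ds["train"].features)  # input_ids, attention_mask, labels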
data/smiles/11M_smiles_old_tokenizer_no_limit/train/dataset_info.json ADDED
@@ -0,0 +1,35 @@
+ {
+     "citation": "",
+     "description": "",
+     "features": {
+         "attention_mask": {
+             "feature": {
+                 "feature": {
+                     "dtype": "int8",
+                     "_type": "Value"
+                 },
+                 "_type": "Sequence"
+             },
+             "_type": "Sequence"
+         },
+         "input_ids": {
+             "feature": {
+                 "feature": {
+                     "dtype": "int32",
+                     "_type": "Value"
+                 },
+                 "_type": "Sequence"
+             },
+             "_type": "Sequence"
+         },
+         "labels": {
+             "feature": {
+                 "dtype": "string",
+                 "_type": "Value"
+             },
+             "_type": "Sequence"
+         }
+     },
+     "homepage": "",
+     "license": ""
+ }
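The schema above nests Sequence inside Sequence: each example carries a list of token-id lists (int32 input_ids, int8 attention_mask) plus a list of string labels. For reference, a sketch of the equivalent declaration with the `datasets` feature API:

# Sketch: the `datasets.Features` spec matching the dataset_info.json above.
from datasets import Features, Sequence, Value

features = Features({
    "attention_mask": Sequence(Sequence(Value("int8"))),
    "input_ids": Sequence(Sequence(Value("int32"))),
    "labels": Sequence(Value("string")),
})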
data/smiles/11M_smiles_old_tokenizer_no_limit/train/state.json ADDED
@@ -0,0 +1,82 @@
+ {
+     "_data_files": [
+         {
+             "filename": "data-00000-of-00024.arrow"
+         },
+         {
+             "filename": "data-00001-of-00024.arrow"
+         },
+         {
+             "filename": "data-00002-of-00024.arrow"
+         },
+         {
+             "filename": "data-00003-of-00024.arrow"
+         },
+         {
+             "filename": "data-00004-of-00024.arrow"
+         },
+         {
+             "filename": "data-00005-of-00024.arrow"
+         },
+         {
+             "filename": "data-00006-of-00024.arrow"
+         },
+         {
+             "filename": "data-00007-of-00024.arrow"
+         },
+         {
+             "filename": "data-00008-of-00024.arrow"
+         },
+         {
+             "filename": "data-00009-of-00024.arrow"
+         },
+         {
+             "filename": "data-00010-of-00024.arrow"
+         },
+         {
+             "filename": "data-00011-of-00024.arrow"
+         },
+         {
+             "filename": "data-00012-of-00024.arrow"
+         },
+         {
+             "filename": "data-00013-of-00024.arrow"
+         },
+         {
+             "filename": "data-00014-of-00024.arrow"
+         },
+         {
+             "filename": "data-00015-of-00024.arrow"
+         },
+         {
+             "filename": "data-00016-of-00024.arrow"
+         },
+         {
+             "filename": "data-00017-of-00024.arrow"
+         },
+         {
+             "filename": "data-00018-of-00024.arrow"
+         },
+         {
+             "filename": "data-00019-of-00024.arrow"
+         },
+         {
+             "filename": "data-00020-of-00024.arrow"
+         },
+         {
+             "filename": "data-00021-of-00024.arrow"
+         },
+         {
+             "filename": "data-00022-of-00024.arrow"
+         },
+         {
+             "filename": "data-00023-of-00024.arrow"
+         }
+     ],
+     "_fingerprint": "f065af1d67a7eca3",
+     "_format_columns": null,
+     "_format_kwargs": {},
+     "_format_type": null,
+     "_output_all_columns": false,
+     "_split": null
+ }
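state.json is what load_from_disk uses to stitch a split back together: _data_files fixes the shard order and _fingerprint identifies this processed version. A quick integrity-check sketch, path hypothetical as before:

# Sketch: confirm every shard named in state.json is present on disk.
import json
import pathlib

split_dir = pathlib.Path("data/smiles/11M_smiles_old_tokenizer_no_limit/train")
state = json.loads((split_dir / "state.json").read_text())
missing = [f["filename"] for f in state["_data_files"]
           if not (split_dir / f["filename"]).exists()]
print("missing shards:", missing or "none")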
data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00000-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b81ab15dbcc73e25ca02140b3c101a3fd2407660a59fbf81ccbc3af96fe38e0f
+ size 479009832
data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00001-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:43ac35ede1a6adf2ca8159a568a8302fad4f6ea04c0a1ecda428b6e1a2e849de
+ size 483483872
data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00002-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c075e124ba0c213d76552b930f7dae7a8823202e3ac2939196b730fe105b7090
+ size 481710208
data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00003-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:576facb6744a5930942432cef811962a92e8f9d3274dfe2fdb40c72273e14ea0
+ size 480840896
data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00004-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d35117920343110ba621ee376db7eccdb96d1f6084ed773142aa1948ef1fe14d
+ size 480085056
data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00005-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:65023f39b5a8856a3e85307b7a36b33fa31591a0d0d3ae1fc6cf86775d0e6f57
+ size 479035216
data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00006-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76246331cfb5db44e2ef1d6e2b42636cc270a5bb8f457ccca09598aa909b2818
+ size 478170416
data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00007-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:60e0c9660f347e5bc71c5065349fbe4026f495321e81975996066b59361d50b3
+ size 477289400
data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00008-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cb17deb75bc25626877746d2f39a029a036d6a8ee180a9a6bfa1dd6c99b4b793
+ size 478124120
data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00009-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a1721819aec27ca4c06b12ee9f3cd4aa5b5ea3ed1ac5d956398ab2790e8c921
+ size 474046656
data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00010-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:59d5ae6b9357e5cc2909b9f9b219a2989cc64ab686eb770eb4b4d3836b9563d8
+ size 475710864
data/smiles/11M_smiles_old_tokenizer_no_limit/val/data-00011-of-00012.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d18bc2c8060542c65d5928816e744ecb8050ca0cac22e6d0ddd0fc9d556d8df9
+ size 471405008
data/smiles/11M_smiles_old_tokenizer_no_limit/val/dataset_info.json ADDED
@@ -0,0 +1,35 @@
+ {
+     "citation": "",
+     "description": "",
+     "features": {
+         "attention_mask": {
+             "feature": {
+                 "feature": {
+                     "dtype": "int8",
+                     "_type": "Value"
+                 },
+                 "_type": "Sequence"
+             },
+             "_type": "Sequence"
+         },
+         "input_ids": {
+             "feature": {
+                 "feature": {
+                     "dtype": "int32",
+                     "_type": "Value"
+                 },
+                 "_type": "Sequence"
+             },
+             "_type": "Sequence"
+         },
+         "labels": {
+             "feature": {
+                 "dtype": "string",
+                 "_type": "Value"
+             },
+             "_type": "Sequence"
+         }
+     },
+     "homepage": "",
+     "license": ""
+ }
data/smiles/11M_smiles_old_tokenizer_no_limit/val/state.json ADDED
@@ -0,0 +1,46 @@
+ {
+     "_data_files": [
+         {
+             "filename": "data-00000-of-00012.arrow"
+         },
+         {
+             "filename": "data-00001-of-00012.arrow"
+         },
+         {
+             "filename": "data-00002-of-00012.arrow"
+         },
+         {
+             "filename": "data-00003-of-00012.arrow"
+         },
+         {
+             "filename": "data-00004-of-00012.arrow"
+         },
+         {
+             "filename": "data-00005-of-00012.arrow"
+         },
+         {
+             "filename": "data-00006-of-00012.arrow"
+         },
+         {
+             "filename": "data-00007-of-00012.arrow"
+         },
+         {
+             "filename": "data-00008-of-00012.arrow"
+         },
+         {
+             "filename": "data-00009-of-00012.arrow"
+         },
+         {
+             "filename": "data-00010-of-00012.arrow"
+         },
+         {
+             "filename": "data-00011-of-00012.arrow"
+         }
+     ],
+     "_fingerprint": "327dd7ee149704f5",
+     "_format_columns": null,
+     "_format_kwargs": {},
+     "_format_type": null,
+     "_output_all_columns": false,
+     "_split": null
+ }
dataset.py CHANGED
@@ -2,7 +2,7 @@
  import re
  import torch
 
- import utils
+ from .utils import utils
 
  from torch.utils.data import Dataset, DataLoader
  import lightning.pytorch as pl
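The switch from `import utils` to `from .utils import utils` makes dataset.py a package-relative module, so it only resolves when the repository root is imported as a package rather than run as a loose script. A hedged sketch of the difference; the package name `mdlm` is hypothetical:

# With relative imports, run modules through the package, not by file path.
#
#   python -m mdlm.main     # works: `from .utils import utils` resolves
#   python mdlm/main.py     # fails: "attempted relative import with no known parent package"
#
# Programmatic equivalent:
import importlib
main = importlib.import_module("mdlm.main")  # hypothetical package name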
main.py CHANGED
@@ -17,16 +17,17 @@ import sys
  import torch.distributed as dist
  from torch.nn.parallel import DistributedDataParallel as DDP
 
- import dataset as dataloader
- import dataloading_for_dynamic_batching as dynamic_dataloader
- from diffusion import Diffusion
- import utils.utils as utils
- from new_tokenizer.ape_tokenizer import APETokenizer
+ from . import dataset as dataloader
+ from . import dataloading_for_dynamic_batching as dynamic_dataloader
+ from .diffusion import Diffusion
+ from .utils import utils
+ from .new_tokenizer.ape_tokenizer import APETokenizer
+ from .tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
+ from .helm_tokenizer.helm_tokenizer import HelmTokenizer
 
  from lightning.pytorch.strategies import DDPStrategy
  from datasets import load_dataset
- from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
- from helm_tokenizer.helm_tokenizer import HelmTokenizer
+
 
 
  omegaconf.OmegaConf.register_new_resolver('cwd', os.getcwd)
@@ -51,201 +52,201 @@ def _load_from_checkpoint(config, tokenizer):
 
  @L.pytorch.utilities.rank_zero_only
  def print_config(
      config: omegaconf.DictConfig,
      resolve: bool = True,
      save_cfg: bool = True) -> None:
    """
    Prints content of DictConfig using Rich library and its tree structure.
 
    Args:
      config (DictConfig): Configuration composed by Hydra.
      resolve (bool): Whether to resolve reference fields of DictConfig.
      save_cfg (bool): Whether to save the configuration tree to a file.
    """
 
    style = 'dim'
    tree = rich.tree.Tree('CONFIG', style=style, guide_style=style)
 
    fields = config.keys()
    for field in fields:
      branch = tree.add(field, style=style, guide_style=style)
 
      config_section = config.get(field)
      branch_content = str(config_section)
      if isinstance(config_section, omegaconf.DictConfig):
        branch_content = omegaconf.OmegaConf.to_yaml(
          config_section, resolve=resolve)
 
      branch.add(rich.syntax.Syntax(branch_content, 'yaml'))
    rich.print(tree)
    if save_cfg:
      with fsspec.open(
        '{}/config_tree.txt'.format(
          config.checkpointing.save_dir), 'w') as fp:
        rich.print(tree, file=fp)
 
 
  @L.pytorch.utilities.rank_zero_only
  def print_batch(train_ds, valid_ds, tokenizer, k=64):
    #for dl_type, dl in [
    #('train', train_ds), ('valid', valid_ds)]:
 
    for dl_type, dl in [
      ('train', train_ds)]:
      print(f'Printing {dl_type} dataloader batch.')
      batch = next(iter(dl))
      print('Batch input_ids.shape', batch['input_ids'].shape)
      first = batch['input_ids'][0, :k]
      last = batch['input_ids'][0, -k:]
      print(f'First {k} tokens:', tokenizer.decode(first))
      print('ids:', first)
      print(f'Last {k} tokens:', tokenizer.decode(last))
      print('ids:', last)
 
 
  def generate_samples(config, logger, tokenizer):
    logger.info('Generating samples.')
    model = _load_from_checkpoint(config=config, tokenizer=tokenizer)
    # model.gen_ppl_metric.reset()
 
    #stride_length = config.sampling.stride_length
    #num_strides = config.sampling.num_strides
 
    for _ in range(config.sampling.num_sample_batches):
      samples = model.restore_model_and_sample(num_steps=config.sampling.steps)
      peptide_sequences = model.tokenizer.batch_decode(samples)
      model.compute_generative_perplexity(peptide_sequences)
 
    print('Peptide samples:', peptide_sequences)
 
    print('Generative perplexity:', model.compute_masked_perplexity())
 
    return peptide_sequences
 
 
  def ppl_eval(config, logger, tokenizer, data_module):
    logger.info('Starting Zero Shot Eval.')
 
    model = _load_from_checkpoint(config=config, tokenizer=tokenizer)
 
    wandb_logger = None
    if config.get('wandb', None) is not None:
      wandb_logger = L.pytorch.loggers.WandbLogger(
        config=omegaconf.OmegaConf.to_object(config),
        **config.wandb)
 
    callbacks = []
 
    if 'callbacks' in config:
      for _, callback in config.callbacks.items():
        callbacks.append(hydra.utils.instantiate(callback))
 
    trainer = hydra.utils.instantiate(
      config.trainer,
      default_root_dir=os.getcwd(),
      callbacks=callbacks,
      strategy=DDPStrategy(find_unused_parameters=True),
      logger=wandb_logger)
 
    #_, valid_ds = dataloader.get_dataloaders(config, tokenizer, skiptrain=True, valid_seed=config.seed)
    trainer.test(model, data_module)
 
 
  def _train(config, logger, tokenizer, data_module):
    logger.info('Starting Training.')
    wandb_logger = None
 
    if config.get('wandb', None) is not None:
      unique_id = str(uuid.uuid4())
 
      config.wandb.id = f"{config.wandb.id}_{unique_id}"
 
      wandb_logger = L.pytorch.loggers.WandbLogger(
        config=omegaconf.OmegaConf.to_object(config),
        **config.wandb)
 
    if (config.checkpointing.resume_from_ckpt
        and config.checkpointing.resume_ckpt_path is not None
        and utils.fsspec_exists(
          config.checkpointing.resume_ckpt_path)):
      ckpt_path = config.checkpointing.resume_ckpt_path
    else:
      ckpt_path = None
 
    # Lightning callbacks
    callbacks = []
    if 'callbacks' in config:
      for callback_name, callback_config in config.callbacks.items():
        if callback_name == 'model_checkpoint':
          model_checkpoint_config = {k: v for k, v in callback_config.items() if k != '_target_'}
          callbacks.append(ModelCheckpoint(**model_checkpoint_config))
        else:
          callbacks.append(hydra.utils.instantiate(callback_config))
 
    if config.training.accumulator:
      accumulator = GradientAccumulationScheduler(scheduling={1: 5, 2: 4, 3: 3, 4: 1})
      callbacks.append(accumulator)
 
    trainer = hydra.utils.instantiate(
      config.trainer,
      default_root_dir=os.getcwd(),
      callbacks=callbacks,
      accelerator='cuda',
      strategy=DDPStrategy(find_unused_parameters=True),
      devices=[2,3,4,5,6,7],
      logger=wandb_logger)
 
    model = Diffusion(config, tokenizer=tokenizer)
 
-   if config.backbone == 'finetune_roformer' and config.eval.checkpoint_path:
+   if config.backbone == "finetune_roformer" and config.eval.checkpoint_path:
      checkpoint = torch.load(config.eval.checkpoint_path, map_location="cpu")
      state = checkpoint.get("state_dict", checkpoint)
      model.load_state_dict(state, strict=False)
 
    trainer.fit(model, datamodule=data_module, ckpt_path=ckpt_path)
 
 
  @hydra.main(version_base=None, config_path='configs', config_name='config')
  def main(config):
    """
    Main entry point for training
    """
    L.seed_everything(config.seed)
 
    # print_config(config, resolve=True, save_cfg=True)
 
    logger = utils.get_logger(__name__)
    # load PeptideCLM tokenizer
    tok_dir = config.paths.tokenizers
    if config.vocab == 'new_smiles':
      tokenizer = APETokenizer()
      tokenizer.load_vocabulary(f'{tok_dir}/peptide_smiles_600_vocab.json')
    elif config.vocab == 'old_smiles':
      tokenizer = SMILES_SPE_Tokenizer(f'{tok_dir}/new_vocab.txt',
                                       f'{tok_dir}/new_splits.txt')
    elif config.vocab == 'selfies':
      tokenizer = APETokenizer()
      tokenizer.load_vocabulary(f'{tok_dir}/peptide_selfies_600_vocab.json')
    elif config.vocab == 'helm':
      tokenizer = HelmTokenizer(f'{tok_dir}/monomer_vocab.txt')
 
    if config.backbone == 'finetune_roformer':
      train_dataset = load_dataset('csv', data_files=config.data.train)
      val_dataset = load_dataset('csv', data_files=config.data.valid)
 
      train_dataset = train_dataset['train']#.select(lst)
      val_dataset = val_dataset['train']#.select(lst)
      data_module = dataloader.CustomDataModule(train_dataset, val_dataset, None, tokenizer, batch_size=config.loader.global_batch_size)
    else:
      data_module = dynamic_dataloader.CustomDataModule(f'{config.paths.data}/smiles/11M_smiles_old_tokenizer_no_limit', tokenizer)
 
    if config.mode == 'sample_eval':
      generate_samples(config, logger, tokenizer)
    elif config.mode == 'ppl_eval':
      ppl_eval(config, logger, tokenizer, data_module)
    else:
      _train(config, logger, tokenizer, data_module)
 
 
  if __name__ == '__main__':
    main()
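One behavioral note on the dispatch above: if config.vocab matches none of the four branches, `tokenizer` is never bound and main() later fails with a NameError rather than a clear message. A hedged sketch of the same selection with an explicit guard, using the tokenizer classes and vocab filenames from the diff and nothing else:

# Sketch: main.py's tokenizer dispatch with an explicit failure mode.
# Assumes the same package imports as in the diff above.
def build_tokenizer(vocab: str, tok_dir: str):
    if vocab == 'new_smiles':
        tok = APETokenizer()
        tok.load_vocabulary(f'{tok_dir}/peptide_smiles_600_vocab.json')
    elif vocab == 'old_smiles':
        tok = SMILES_SPE_Tokenizer(f'{tok_dir}/new_vocab.txt',
                                   f'{tok_dir}/new_splits.txt')
    elif vocab == 'selfies':
        tok = APETokenizer()
        tok.load_vocabulary(f'{tok_dir}/peptide_selfies_600_vocab.json')
    elif vocab == 'helm':
        tok = HelmTokenizer(f'{tok_dir}/monomer_vocab.txt')
    else:
        # main.py as committed would hit a NameError further down instead
        raise ValueError(f'unknown vocab: {vocab}')
    return tok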
utils/.ipynb_checkpoints/app-checkpoint.py ADDED
@@ -0,0 +1,1255 @@
1
+ import os
2
+ import re
3
+ import pandas as pd
4
+ from io import StringIO
5
+ import rdkit
6
+ from rdkit import Chem
7
+ from rdkit.Chem import AllChem, Draw
8
+ import numpy as np
9
+ from PIL import Image, ImageDraw, ImageFont
10
+ import matplotlib.pyplot as plt
11
+ import matplotlib.patches as patches
12
+ from io import BytesIO
13
+ import tempfile
14
+ from rdkit import Chem
15
+
16
+ class PeptideAnalyzer:
17
+ def __init__(self):
18
+ self.bond_patterns = [
19
+ (r'OC\(=O\)', 'ester'), # Ester bond
20
+ (r'N\(C\)C\(=O\)', 'n_methyl'), # N-methylated peptide bond
21
+ (r'N[0-9]C\(=O\)', 'proline'), # Proline peptide bond
22
+ (r'NC\(=O\)', 'peptide'), # Standard peptide bond
23
+ (r'C\(=O\)N\(C\)', 'n_methyl_reverse'), # Reverse N-methylated
24
+ (r'C\(=O\)N[12]?', 'peptide_reverse') # Reverse peptide bond
25
+ ]
26
+ # Three to one letter code mapping
27
+ self.three_to_one = {
28
+ 'Ala': 'A', 'Cys': 'C', 'Asp': 'D', 'Glu': 'E',
29
+ 'Phe': 'F', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
30
+ 'Lys': 'K', 'Leu': 'L', 'Met': 'M', 'Asn': 'N',
31
+ 'Pro': 'P', 'Gln': 'Q', 'Arg': 'R', 'Ser': 'S',
32
+ 'Thr': 'T', 'Val': 'V', 'Trp': 'W', 'Tyr': 'Y'
33
+ }
34
+
35
+ def is_peptide(self, smiles):
36
+ """Check if the SMILES represents a peptide structure"""
37
+ mol = Chem.MolFromSmiles(smiles)
38
+ if mol is None:
39
+ return False
40
+
41
+ # Look for peptide bonds: NC(=O) pattern
42
+ peptide_bond_pattern = Chem.MolFromSmarts('[NH][C](=O)')
43
+ if mol.HasSubstructMatch(peptide_bond_pattern):
44
+ return True
45
+
46
+ # Look for N-methylated peptide bonds: N(C)C(=O) pattern
47
+ n_methyl_pattern = Chem.MolFromSmarts('[N;H0;$(NC)](C)[C](=O)')
48
+ if mol.HasSubstructMatch(n_methyl_pattern):
49
+ return True
50
+
51
+ return False
52
+
53
+ def is_cyclic(self, smiles):
54
+ """Improved cyclic peptide detection"""
55
+ # Check for C-terminal carboxyl
56
+ if smiles.endswith('C(=O)O'):
57
+ return False, [], []
58
+
59
+ # Find all numbers used in ring closures
60
+ ring_numbers = re.findall(r'(?:^|[^c])[0-9](?=[A-Z@\(\)])', smiles)
61
+
62
+ # Find aromatic ring numbers
63
+ aromatic_matches = re.findall(r'c[0-9](?:ccccc|c\[nH\]c)[0-9]', smiles)
64
+ aromatic_cycles = []
65
+ for match in aromatic_matches:
66
+ numbers = re.findall(r'[0-9]', match)
67
+ aromatic_cycles.extend(numbers)
68
+
69
+ # Numbers that aren't part of aromatic rings are peptide cycles
70
+ peptide_cycles = [n for n in ring_numbers if n not in aromatic_cycles]
71
+
72
+ is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')
73
+ return is_cyclic, peptide_cycles, aromatic_cycles
74
+
75
+ def split_on_bonds(self, smiles):
76
+ """Split SMILES into segments with simplified Pro handling"""
77
+ positions = []
78
+ used = set()
79
+
80
+ # Find Gly pattern first
81
+ gly_pattern = r'NCC\(=O\)'
82
+ for match in re.finditer(gly_pattern, smiles):
83
+ if not any(p in range(match.start(), match.end()) for p in used):
84
+ positions.append({
85
+ 'start': match.start(),
86
+ 'end': match.end(),
87
+ 'type': 'gly',
88
+ 'pattern': match.group()
89
+ })
90
+ used.update(range(match.start(), match.end()))
91
+
92
+ for pattern, bond_type in self.bond_patterns:
93
+ for match in re.finditer(pattern, smiles):
94
+ if not any(p in range(match.start(), match.end()) for p in used):
95
+ positions.append({
96
+ 'start': match.start(),
97
+ 'end': match.end(),
98
+ 'type': bond_type,
99
+ 'pattern': match.group()
100
+ })
101
+ used.update(range(match.start(), match.end()))
102
+
103
+ # Sort by position
104
+ positions.sort(key=lambda x: x['start'])
105
+
106
+ # Create segments
107
+ segments = []
108
+
109
+ if positions:
110
+ # First segment
111
+ if positions[0]['start'] > 0:
112
+ segments.append({
113
+ 'content': smiles[0:positions[0]['start']],
114
+ 'bond_after': positions[0]['pattern']
115
+ })
116
+
117
+ # Process segments
118
+ for i in range(len(positions)-1):
119
+ current = positions[i]
120
+ next_pos = positions[i+1]
121
+
122
+ if current['type'] == 'gly':
123
+ segments.append({
124
+ 'content': 'NCC(=O)',
125
+ 'bond_before': positions[i-1]['pattern'] if i > 0 else None,
126
+ 'bond_after': next_pos['pattern']
127
+ })
128
+ else:
129
+ content = smiles[current['end']:next_pos['start']]
130
+ if content:
131
+ segments.append({
132
+ 'content': content,
133
+ 'bond_before': current['pattern'],
134
+ 'bond_after': next_pos['pattern']
135
+ })
136
+
137
+ # Last segment
138
+ if positions[-1]['end'] < len(smiles):
139
+ segments.append({
140
+ 'content': smiles[positions[-1]['end']:],
141
+ 'bond_before': positions[-1]['pattern']
142
+ })
143
+
144
+ return segments
145
+
146
+ def clean_terminal_carboxyl(self, segment):
147
+ """Remove C-terminal carboxyl only if it's the true terminus"""
148
+ content = segment['content']
149
+
150
+ # Only clean if:
151
+ # 1. Contains C(=O)O
152
+ # 2. No bond_after exists (meaning it's the last segment)
153
+ # 3. C(=O)O is at the end of the content
154
+ if 'C(=O)O' in content and not segment.get('bond_after'):
155
+ print('recognized?')
156
+ # Remove C(=O)O pattern regardless of position
157
+ cleaned = re.sub(r'\(C\(=O\)O\)', '', content)
158
+ # Remove any leftover empty parentheses
159
+ cleaned = re.sub(r'\(\)', '', cleaned)
160
+ print(cleaned)
161
+ return cleaned
162
+ return content
163
+
164
+ def identify_residue(self, segment):
165
+ """Identify residue with Pro reconstruction"""
166
+ # Only clean terminal carboxyl if this is the last segment
167
+ content = self.clean_terminal_carboxyl(segment)
168
+ mods = self.get_modifications(segment)
169
+
170
+ # UAA pattern matching section - before regular residues
171
+ # Phenylglycine and derivatives
172
+ if 'c1ccccc1' in content:
173
+ if '[C@@H](c1ccccc1)' in content or '[C@H](c1ccccc1)' in content:
174
+ return '4', mods # Base phenylglycine
175
+
176
+ # 4-substituted phenylalanines
177
+ if 'Cc1ccc' in content:
178
+ if 'OMe' in content or 'OCc1ccc' in content:
179
+ return '0A1', mods # 4-methoxy-Phenylalanine
180
+ elif 'Clc1ccc' in content:
181
+ return '200', mods # 4-chloro-Phenylalanine
182
+ elif 'Brc1ccc' in content:
183
+ return '4BF', mods # 4-Bromo-phenylalanine
184
+ elif 'C#Nc1ccc' in content:
185
+ return '4CF', mods # 4-cyano-phenylalanine
186
+ elif 'Ic1ccc' in content:
187
+ return 'PHI', mods # 4-Iodo-phenylalanine
188
+ elif 'Fc1ccc' in content:
189
+ return 'PFF', mods # 4-Fluoro-phenylalanine
190
+
191
+ # Modified tryptophans
192
+ if 'c[nH]c2' in content:
193
+ if 'Oc2cccc2' in content:
194
+ return '0AF', mods # 7-hydroxy-tryptophan
195
+ elif 'Fc2cccc2' in content:
196
+ return '4FW', mods # 4-fluoro-tryptophan
197
+ elif 'Clc2cccc2' in content:
198
+ return '6CW', mods # 6-chloro-tryptophan
199
+ elif 'Brc2cccc2' in content:
200
+ return 'BTR', mods # 6-bromo-tryptophan
201
+ elif 'COc2cccc2' in content:
202
+ return 'MOT5', mods # 5-Methoxy-tryptophan
203
+ elif 'Cc2cccc2' in content:
204
+ return 'MTR5', mods # 5-Methyl-tryptophan
205
+
206
+ # Special amino acids
207
+ if 'CC(C)(C)[C@@H]' in content or 'CC(C)(C)[C@H]' in content:
208
+ return 'BUG', mods # Tertleucine
209
+
210
+ if 'CCCNC(=N)N' in content:
211
+ return 'CIR', mods # Citrulline
212
+
213
+ if '[SeH]' in content:
214
+ return 'CSE', mods # Selenocysteine
215
+
216
+ if '[NH3]CC[C@@H]' in content or '[NH3]CC[C@H]' in content:
217
+ return 'DAB', mods # Diaminobutyric acid
218
+
219
+ if 'C1CCCCC1' in content:
220
+ if 'C1CCCCC1[C@@H]' in content or 'C1CCCCC1[C@H]' in content:
221
+ return 'CHG', mods # Cyclohexylglycine
222
+ elif 'C1CCCCC1C[C@@H]' in content or 'C1CCCCC1C[C@H]' in content:
223
+ return 'ALC', mods # 3-cyclohexyl-alanine
224
+
225
+ # Naphthalene derivatives
226
+ if 'c1cccc2c1cccc2' in content:
227
+ if 'c1cccc2c1cccc2[C@@H]' in content or 'c1cccc2c1cccc2[C@H]' in content:
228
+ return 'NAL', mods # 2-Naphthyl-alanine
229
+
230
+ # Heteroaromatic derivatives
231
+ if 'c1cncc' in content:
232
+ return 'PYR4', mods # 3-(4-Pyridyl)-alanine
233
+ if 'c1cscc' in content:
234
+ return 'THA3', mods # 3-(3-thienyl)-alanine
235
+ if 'c1nnc' in content:
236
+ return 'TRZ4', mods # 3-(1,2,4-Triazol-1-yl)-alanine
237
+
238
+ # Modified serines and threonines
239
+ if 'OP(O)(O)O' in content:
240
+ if '[C@@H](COP' in content or '[C@H](COP' in content:
241
+ return 'SEP', mods # phosphoserine
242
+ elif '[C@@H](OP' in content or '[C@H](OP' in content:
243
+ return 'TPO', mods # phosphothreonine
244
+
245
+ # Specialized ring systems
246
+ if 'c1c2ccccc2cc2c1cccc2' in content:
247
+ return 'ANTH', mods # 3-(9-anthryl)-alanine
248
+ if 'c1csc2c1cccc2' in content:
249
+ return 'BTH3', mods # 3-(3-benzothienyl)-alanine
250
+ if '[C@]12C[C@H]3C[C@@H](C2)C[C@@H](C1)C3' in content:
251
+ return 'ADAM', mods # Adamanthane
252
+
253
+ # Fluorinated derivatives
254
+ if 'FC(F)(F)' in content:
255
+ if 'CC(F)(F)F' in content:
256
+ return 'FLA', mods # Trifluoro-alanine
257
+ if 'C(F)(F)F)c1' in content:
258
+ if 'c1ccccc1C(F)(F)F' in content:
259
+ return 'TFG2', mods # 2-(Trifluoromethyl)-phenylglycine
260
+ if 'c1cccc(c1)C(F)(F)F' in content:
261
+ return 'TFG3', mods # 3-(Trifluoromethyl)-phenylglycine
262
+ if 'c1ccc(cc1)C(F)(F)F' in content:
263
+ return 'TFG4', mods # 4-(Trifluoromethyl)-phenylglycine
264
+
265
+ # Multiple halogen patterns
266
+ if 'F' in content and 'c1' in content:
267
+ if 'c1ccc(c(c1)F)F' in content:
268
+ return 'F2F', mods # 3,4-Difluoro-phenylalanine
269
+ if 'cc(F)cc(c1)F' in content:
270
+ return 'WFP', mods # 3,5-Difluoro-phenylalanine
271
+ if 'Cl' in content and 'c1' in content:
272
+ if 'c1ccc(cc1Cl)Cl' in content:
273
+ return 'CP24', mods # 2,4-dichloro-phenylalanine
274
+ if 'c1ccc(c(c1)Cl)Cl' in content:
275
+ return 'CP34', mods # 3,4-dichloro-phenylalanine
276
+
277
+ # Hydroxy and amino derivatives
278
+ if 'O' in content and 'c1' in content:
279
+ if 'c1cc(O)cc(c1)O' in content:
280
+ return '3FG', mods # (2s)-amino(3,5-dihydroxyphenyl)-ethanoic acid
281
+ if 'c1ccc(c(c1)O)O' in content:
282
+ return 'DAH', mods # 3,4-Dihydroxy-phenylalanine
283
+
284
+ # Cyclic amino acids
285
+ if 'C1CCCC1' in content:
286
+ return 'CPA3', mods # 3-Cyclopentyl-alanine
287
+ if 'C1CCCCC1' in content:
288
+ if 'CC1CCCCC1' in content:
289
+ return 'ALC', mods # 3-cyclohexyl-alanine
290
+ else:
291
+ return 'CHG', mods # Cyclohexylglycine
292
+
293
+ # Chain-length variants
294
+ if 'CCC[C@@H]' in content or 'CCC[C@H]' in content:
295
+ return 'NLE', mods # Norleucine
296
+ if 'CC[C@@H]' in content or 'CC[C@H]' in content:
297
+ if not any(x in content for x in ['CC(C)', 'COC', 'CN(']):
298
+ return 'ABA', mods # 2-Aminobutyric acid
299
+
300
+ # Modified histidines
301
+ if 'c1cnc' in content:
302
+ if '[C@@H]1CN[C@@H](N1)F' in content:
303
+ return '2HF', mods # 2-fluoro-l-histidine
304
+ if 'c1cnc([nH]1)F' in content:
305
+ return '2HF1', mods # 2-fluoro-l-histidine variant
306
+ if 'c1c[nH]c(n1)F' in content:
307
+ return '2HF2', mods # 2-fluoro-l-histidine variant
308
+
309
+ # Sulfur and selenium containing
310
+ if '[SeH]' in content:
311
+ return 'CSE', mods # Selenocysteine
312
+ if 'S' in content:
313
+ if 'CSCc1ccccc1' in content:
314
+ return 'BCS', mods # benzylcysteine
315
+ if 'CCSC' in content:
316
+ return 'ESC', mods # Ethionine
317
+ if 'CCS' in content:
318
+ return 'HCS', mods # homocysteine
319
+
320
+ # Additional modifications
321
+ if 'CN=[N]=N' in content:
322
+ return 'AZDA', mods # azido-alanine
323
+ if '[NH]=[C](=[NH2])=[NH2]' in content:
324
+ if 'CCC[NH]=' in content:
325
+ return 'AGM', mods # 5-methyl-arginine
326
+ if 'CC[NH]=' in content:
327
+ return 'GDPR', mods # 2-Amino-3-guanidinopropionic acid
328
+
329
+ if 'CCON' in content:
330
+ return 'CAN', mods # canaline
331
+ if '[C@@H]1C=C[C@@H](C=C1)' in content:
332
+ return 'ACZ', mods # cis-amiclenomycin
333
+ if 'CCC(=O)[NH3]' in content:
334
+ return 'ONL', mods # 5-oxo-l-norleucine
335
+ if 'c1ccncc1' in content:
336
+ return 'PYR4', mods # 3-(4-Pyridyl)-alanine
337
+ if 'c1ccco1' in content:
338
+ return 'FUA2', mods # (2-furyl)-alanine
339
+
340
+ if 'c1ccc' in content:
341
+ if 'c1ccc(cc1)c1ccccc1' in content:
342
+ return 'BIF', mods # 4,4-biphenylalanine
343
+ if 'c1ccc(cc1)C(=O)c1ccccc1' in content:
344
+ return 'PBF', mods # 4-benzoyl-phenylalanine
345
+ if 'c1ccc(cc1)C(C)(C)C' in content:
346
+ return 'TBP4', mods # 4-tert-butyl-phenylalanine
347
+ if 'c1ccc(cc1)[C](=[NH2])=[NH2]' in content:
348
+ return '0BN', mods # 4-carbamimidoyl-l-phenylalanine
349
+ if 'c1cccc(c1)[C](=[NH2])=[NH2]' in content:
350
+ return 'APM', mods # m-amidinophenyl-3-alanine
351
+
352
+ # Multiple hydroxy patterns
353
+ if 'O' in content:
354
+ if '[C@H]([C@H](C)O)O' in content:
355
+ return 'ILX', mods # 4,5-dihydroxy-isoleucine
356
+ if '[C@H]([C@@H](C)O)O' in content:
357
+ return 'ALO', mods # Allo-threonine
358
+ if '[C@H](COP(O)(O)O)' in content:
359
+ return 'SEP', mods # phosphoserine
360
+ if '[C@H]([C@@H](C)OP(O)(O)O)' in content:
361
+ return 'TPO', mods # phosphothreonine
362
+ if '[C@H](c1ccc(O)cc1)O' in content:
363
+ return 'OMX', mods # (betar)-beta-hydroxy-l-tyrosine
364
+ if '[C@H](c1ccc(c(Cl)c1)O)O' in content:
365
+ return 'OMY', mods # (betar)-3-chloro-beta-hydroxy-l-tyrosine
366
+
367
+ # Heterocyclic patterns
368
+ if 'n1' in content:
369
+ if 'n1cccn1' in content:
370
+ return 'PYZ1', mods # 3-(1-Pyrazolyl)-alanine
371
+ if 'n1nncn1' in content:
372
+ return 'TEZA', mods # 3-(2-Tetrazolyl)-alanine
373
+ if 'c2c(n1)cccc2' in content:
374
+ return 'QU32', mods # 3-(2-Quinolyl)-alanine
375
+ if 'c1cnc2c(c1)cccc2' in content:
376
+ return 'QU33', mods # 3-(3-quinolyl)-alanine
377
+ if 'c1ccnc2c1cccc2' in content:
378
+ return 'QU34', mods # 3-(4-quinolyl)-alanine
379
+ if 'c1ccc2c(c1)nccc2' in content:
380
+ return 'QU35', mods # 3-(5-Quinolyl)-alanine
381
+ if 'c1ccc2c(c1)cncc2' in content:
382
+ return 'QU36', mods # 3-(6-Quinolyl)-alanine
383
+ if 'c1cnc2c(n1)cccc2' in content:
384
+ return 'QX32', mods # 3-(2-quinoxalyl)-alanine
385
+
386
+ # Multiple nitrogen patterns
387
+ if 'N' in content:
388
+ if '[NH3]CC[C@@H]' in content:
389
+ return 'DAB', mods # Diaminobutyric acid
390
+ if '[NH3]C[C@@H]' in content:
391
+ return 'DPP', mods # 2,3-Diaminopropanoic acid
392
+ if '[NH3]CCCCCC[C@@H]' in content:
393
+ return 'HHK', mods # (2s)-2,8-diaminooctanoic acid
394
+ if 'CCC[NH]=[C](=[NH2])=[NH2]' in content:
395
+ return 'GBUT', mods # 2-Amino-4-guanidinobutryric acid
396
+ if '[NH]=[C](=S)=[NH2]' in content:
397
+ return 'THIC', mods # Thio-citrulline
398
+
399
+ # Chain modified amino acids
400
+ if 'CC' in content:
401
+ if 'CCCC[C@@H]' in content:
402
+ return 'AHP', mods # 2-Aminoheptanoic acid
403
+ if 'CCC([C@@H])(C)C' in content:
404
+ return 'I2M', mods # 3-methyl-l-alloisoleucine
405
+ if 'CC[C@H]([C@@H])C' in content:
406
+ return 'IIL', mods # Allo-Isoleucine
407
+ if '[C@H](CCC(C)C)' in content:
408
+ return 'HLEU', mods # Homoleucine
409
+ if '[C@@H]([C@@H](C)O)C' in content:
410
+ return 'HLU', mods # beta-hydroxyleucine
411
+
412
+ # Modified glutamate/aspartate patterns
413
+ if '[C@@H]' in content:
414
+ if '[C@@H](C[C@@H](F))' in content:
415
+ return 'FGA4', mods # 4-Fluoro-glutamic acid
416
+ if '[C@@H](C[C@@H](O))' in content:
417
+ return '3GL', mods # 4-hydroxy-glutamic-acid
418
+ if '[C@@H](C[C@H](C))' in content:
419
+ return 'LME', mods # (3r)-3-methyl-l-glutamic acid
420
+ if '[C@@H](CC[C@H](C))' in content:
421
+ return 'MEG', mods # (3s)-3-methyl-l-glutamic acid
422
+
423
+ # Sulfur and selenium modifications
424
+ if 'S' in content:
425
+ if 'SCC[C@@H]' in content:
426
+ return 'HSER', mods # homoserine
427
+ if 'SCCN' in content:
428
+ return 'SLZ', mods # thialysine
429
+ if 'SC(=O)' in content:
430
+ return 'CSA', mods # s-acetonylcysteine
431
+ if '[S@@](=O)' in content:
432
+ return 'SME', mods # Methionine sulfoxide
433
+ if 'S(=O)(=O)' in content:
434
+ return 'OMT', mods # Methionine sulfone
435
+
436
+ # Double bond containing
437
+ if 'C=' in content:
438
+ if 'C=C[C@@H]' in content:
439
+ return '2AG', mods # 2-Allyl-glycine
440
+ if 'C=C[C@@H]' in content:
441
+ return 'LVG', mods # vinylglycine
442
+ if 'C=Cc1ccccc1' in content:
443
+ return 'STYA', mods # Styrylalanine
444
+
445
+ # Special cases
446
+ if '[C@@H]1Cc2c(C1)cccc2' in content:
447
+ return 'IGL', mods # alpha-amino-2-indanacetic acid
448
+ if '[C](=[C](=O)=O)=O' in content:
449
+ return '26P', mods # 2-amino-6-oxopimelic acid
450
+ if '[C](=[C](=O)=O)=C' in content:
451
+ return '2NP', mods # l-2-amino-6-methylene-pimelic acid
452
+ if 'c2cnc[nH]2' in content:
453
+ return 'HIS', mods # histidine core
454
+ if 'c1cccc2c1cc(O)cc2' in content:
455
+ return 'NAO1', mods # 5-hydroxy-1-naphthalene
456
+ if 'c1ccc2c(c1)cc(O)cc2' in content:
457
+ return 'NAO2', mods # 6-hydroxy-2-naphthalene
458
+
459
+ # Proline (P) - flexible ring numbers
460
+ if any([
461
+ # Check for any ring number in bond patterns
462
+ (segment.get('bond_after', '').startswith(f'N{n}C(=O)') and 'CCC' in content and
463
+ any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
464
+ for n in '123456789'
465
+ ]) or any([
466
+ # Check ending patterns with any ring number
467
+ (f'CCCN{n}' in content and content.endswith('=O') and
468
+ any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
469
+ for n in '123456789'
470
+ ]) or any([
471
+ # Handle CCC[C@H]n patterns
472
+ (content == f'CCC[C@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
473
+ (content == f'CCC[C@@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
474
+ # N-terminal Pro with any ring number
475
+ (f'N{n}CCC[C@H]{n}' in content) or
476
+ (f'N{n}CCC[C@@H]{n}' in content)
477
+ for n in '123456789'
478
+ ]):
479
+ return 'Pro', mods
480
+
481
+ # Tryptophan (W) - more specific indole pattern
482
+ if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
483
+ 'c[nH]c' in content.replace(' ', ''):
484
+ return 'Trp', mods
485
+
486
+ # Lysine (K) - both patterns
487
+ if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
488
+ return 'Lys', mods
489
+
490
+ # Arginine (R) - both patterns
491
+ if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
492
+ return 'Arg', mods
493
+
494
+ if ('C[C@H](CCCC)' in content or 'C[C@@H](CCCC)' in content) and 'CC(C)' not in content:
495
+ return 'Nle', mods
496
+
497
+ # Ornithine (Orn) - 3-carbon chain with NH2
498
+ if ('C[C@H](CCCN)' in content or 'C[C@@H](CCCN)' in content) and 'CC(C)' not in content:
499
+ return 'Orn', mods
500
+
501
+ # 2-Naphthylalanine (2Nal) - distinct from Phe pattern
502
+ if ('Cc3cc2ccccc2c3' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
503
+ return '2Nal', mods
504
+
505
+ # Cyclohexylalanine (Cha) - already in your code but moved here for clarity
506
+ if 'N2CCCCC2' in content or 'CCCCC2' in content:
507
+ return 'Cha', mods
508
+
509
+ # Aminobutyric acid (Abu) - 2-carbon chain
510
+ if ('C[C@H](CC)' in content or 'C[C@@H](CC)' in content) and not any(p in content for p in ['CC(C)', 'CCCC', 'CCC(C)']):
511
+ return 'Abu', mods
512
+
513
+ # Pipecolic acid (Pip) - 6-membered ring like Pro
514
+ if ('N3CCCCC3' in content or 'CCCCC3' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
515
+ return 'Pip', mods
516
+
517
+ # Cyclohexylglycine (Chg) - direct cyclohexyl without CH2
518
+ if ('C[C@H](C1CCCCC1)' in content or 'C[C@@H](C1CCCCC1)' in content):
519
+ return 'Chg', mods
520
+
521
+ # 4-Fluorophenylalanine (4F-Phe)
522
+ if ('Cc2ccc(F)cc2' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
523
+ return '4F-Phe', mods
524
+
525
+ # Regular residue identification
526
+ if ('NCC(=O)' in content) or (content == 'C'):
527
+ # Middle case - between bonds
528
+ if segment.get('bond_before') and segment.get('bond_after'):
529
+ if ('C(=O)N' in segment['bond_before'] or 'C(=O)N(C)' in segment['bond_before']):
530
+ return 'Gly', mods
531
+ # Terminal case - at the end
532
+ elif segment.get('bond_before') and segment.get('bond_before').startswith('C(=O)N'):
533
+ return 'Gly', mods
534
+
535
+ if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content:
536
+ return 'Leu', mods
537
+ if '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content:
538
+ return 'Leu', mods
539
+
540
+ if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content:
541
+ return 'Thr', mods
542
+
543
+ if '[C@H](Cc2ccccc2)' in content or '[C@@H](Cc2ccccc2)' in content:
544
+ return 'Phe', mods
545
+
546
+ if ('[C@H](C(C)C)' in content or # With outer parentheses
547
+ '[C@@H](C(C)C)' in content or # With outer parentheses
548
+ '[C@H]C(C)C' in content or # Without outer parentheses
549
+ '[C@@H]C(C)C' in content): # Without outer parentheses
550
+ if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]']): # Still check not Leu
551
+ return 'Val', mods
552
+
553
+ if '[C@H](COC(C)(C)C)' in content or '[C@@H](COC(C)(C)C)' in content:
554
+ return 'O-tBu', mods
555
+
556
+ if any([
557
+ 'CC[C@H](C)' in content,
558
+ 'CC[C@@H](C)' in content,
559
+ 'C(C)C[C@H]' in content and 'CC(C)C' not in content,
560
+ 'C(C)C[C@@H]' in content and 'CC(C)C' not in content
561
+ ]):
562
+ return 'Ile', mods
563
+
564
+ if ('[C@H](C)' in content or '[C@@H](C)' in content):
565
+ if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
566
+ return 'Ala', mods
567
+
568
+ # Tyrosine (Tyr) - 4-hydroxybenzyl side chain
569
+ if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
570
+ return 'Tyr', mods
571
+
572
+
573
+ # Serine (Ser) - Hydroxymethyl side chain
574
+ if '[C@H](CO)' in content or '[C@@H](CO)' in content:
575
+ if not ('C(C)O' in content or 'COC' in content):
576
+ return 'Ser', mods
577
+
578
+ # Threonine (Thr) - 1-hydroxyethyl side chain
579
+ if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content or '[C@@H](C)O' in content or '[C@H](C)O' in content:
580
+ return 'Thr', mods
581
+
582
+ # Cysteine (Cys) - Thiol side chain
583
+ if '[C@H](CS)' in content or '[C@@H](CS)' in content:
584
+ return 'Cys', mods
585
+
586
+ # Methionine (Met) - Methylthioethyl side chain
587
+ if ('C[C@H](CCSC)' in content or 'C[C@@H](CCSC)' in content):
588
+ return 'Met', mods
589
+
590
+ # Asparagine (Asn) - Carbamoylmethyl side chain
591
+ if ('CC(=O)N' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
592
+ return 'Asn', mods
593
+
594
+ # Glutamine (Gln) - Carbamoylethyl side chain
595
+ if ('CCC(=O)N' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
596
+ return 'Gln', mods
597
+
598
+ # Aspartic acid (Asp) - Carboxymethyl side chain
599
+ if ('CC(=O)O' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
600
+ return 'Asp', mods
601
+
602
+ # Glutamic acid (Glu) - Carboxyethyl side chain
603
+ if ('CCC(=O)O' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
604
+ return 'Glu', mods
605
+
606
+ # Arginine (Arg) - 3-guanidinopropyl side chain
607
+ if ('CCCNC(=N)N' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
608
+ return 'Arg', mods
609
+
610
+ # Histidine (His) - Imidazole side chain
611
+ if ('Cc2cnc[nH]2' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
612
+ return 'His', mods
613
+
614
+ return None, mods
615
+
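+ # Usage sketch (hypothetical segment dict; keys mirror split_on_bonds output):
+ #   seg = {'content': 'C[C@H](CC(C)C)', 'bond_before': 'C(=O)N', 'bond_after': 'C(=O)N'}
+ #   analyzer.identify_residue(seg)  # -> ('Leu', [])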
616
+ def get_modifications(self, segment):
617
+ """Get modifications based on bond types"""
618
+ mods = []
619
+ if segment.get('bond_after'):
620
+ if 'N(C)' in segment['bond_after'] or segment['bond_after'].startswith('C(=O)N(C)'):
621
+ mods.append('N-Me')
622
+ if 'OC(=O)' in segment['bond_after']:
623
+ mods.append('O-linked')
624
+ return mods
625
+
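+ # Examples (hypothetical bond strings, matching the two checks above):
+ #   get_modifications({'bond_after': 'C(=O)N(C)'})  -> ['N-Me']
+ #   get_modifications({'bond_after': 'OC(=O)'})     -> ['O-linked']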
626
+ def analyze_structure(self, smiles):
627
+ """Main analysis function with debug output"""
628
+ print("\nAnalyzing structure:", smiles)
629
+
630
+ # Split into segments
631
+ segments = self.split_on_bonds(smiles)
632
+
633
+ print("\nSegment Analysis:")
634
+ sequence = []
635
+ for i, segment in enumerate(segments):
636
+ print(f"\nSegment {i}:")
637
+ print(f"Content: {segment['content']}")
638
+ print(f"Bond before: {segment.get('bond_before', 'None')}")
639
+ print(f"Bond after: {segment.get('bond_after', 'None')}")
640
+
641
+ residue, mods = self.identify_residue(segment)
642
+ if residue:
643
+ if mods:
644
+ sequence.append(f"{residue}({','.join(mods)})")
645
+ else:
646
+ sequence.append(residue)
647
+ print(f"Identified as: {residue}")
648
+ print(f"Modifications: {mods}")
649
+ else:
650
+ print(f"Warning: Could not identify residue in segment: {segment['content']}")
651
+
652
+ # Check if cyclic
653
+ is_cyclic, peptide_cycles, aromatic_cycles = self.is_cyclic(smiles)
654
+ three_letter = '-'.join(sequence)
655
+ one_letter = ''.join(self.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence)
656
+
657
+ if is_cyclic:
658
+ three_letter = f"cyclo({three_letter})"
659
+ one_letter = f"cyclo({one_letter})"
660
+
661
+ print(f"\nFinal sequence: {three_letter}")
662
+ print(f"One-letter code: {one_letter}")
663
+ print(f"Is cyclic: {is_cyclic}")
664
+ #print(f"Peptide cycles: {peptide_cycles}")
665
+ #print(f"Aromatic cycles: {aromatic_cycles}")
666
+
667
+ return three_letter, len(segments)
673
+
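+ # Usage sketch (illustrative SMILES, assumes the module's imports):
+ #   analyzer = PeptideAnalyzer()
+ #   three_letter, n_segments = analyzer.analyze_structure('CC(=O)N[C@@H](C)C(=O)O')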
674
+ def return_sequence(self, smiles):
675
+ """Main analysis function with debug output"""
676
+ print("\nAnalyzing structure:", smiles)
677
+
678
+ # Split into segments
679
+ segments = self.split_on_bonds(smiles)
680
+
681
+ print("\nSegment Analysis:")
682
+ sequence = []
683
+ for i, segment in enumerate(segments):
684
+ print(f"\nSegment {i}:")
685
+ print(f"Content: {segment['content']}")
686
+ print(f"Bond before: {segment.get('bond_before', 'None')}")
687
+ print(f"Bond after: {segment.get('bond_after', 'None')}")
688
+
689
+ residue, mods = self.identify_residue(segment)
690
+ if residue:
691
+ if mods:
692
+ sequence.append(f"{residue}({','.join(mods)})")
693
+ else:
694
+ sequence.append(residue)
695
+ print(f"Identified as: {residue}")
696
+ print(f"Modifications: {mods}")
697
+ else:
698
+ print(f"Warning: Could not identify residue in segment: {segment['content']}")
699
+
700
+ return sequence
701
+
702
+ """
703
+ def annotate_cyclic_structure(mol, sequence):
704
+ '''Create annotated 2D structure with clear, non-overlapping residue labels'''
705
+ # Generate 2D coordinates
707
+ AllChem.Compute2DCoords(mol)
708
+
709
+ # Create drawer with larger size for annotations
710
+ drawer = Draw.rdMolDraw2D.MolDraw2DCairo(2000, 2000) # Even larger size
711
+
712
+ # Get residue list and reverse it to match structural representation
713
+ if sequence.startswith('cyclo('):
714
+ residues = sequence[6:-1].split('-')
715
+ else:
716
+ residues = sequence.split('-')
717
+ residues = list(reversed(residues)) # Reverse the sequence
718
+
719
+ # Draw molecule first to get its bounds
720
+ drawer.drawOptions().addAtomIndices = False
721
+ drawer.DrawMolecule(mol)
722
+ drawer.FinishDrawing()
723
+
724
+ # Convert to PIL Image
725
+ img = Image.open(BytesIO(drawer.GetDrawingText()))
726
+ draw = ImageDraw.Draw(img)
727
+
728
+ try:
729
+ # Try to use DejaVuSans as it's commonly available on Linux systems
730
+ font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 60)
731
+ small_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 60)
732
+ except OSError:
733
+ try:
734
+ # Fallback to Arial if available (common on Windows)
735
+ font = ImageFont.truetype("arial.ttf", 60)
736
+ small_font = ImageFont.truetype("arial.ttf", 60)
737
+ except OSError:
738
+ # If no TrueType fonts are available, fall back to default
739
+ print("Warning: TrueType fonts not available, using default font")
740
+ font = ImageFont.load_default()
741
+ small_font = ImageFont.load_default()
742
+ # Get molecule bounds
743
+ conf = mol.GetConformer()
744
+ positions = []
745
+ for i in range(mol.GetNumAtoms()):
746
+ pos = conf.GetAtomPosition(i)
747
+ positions.append((pos.x, pos.y))
748
+
749
+ x_coords = [p[0] for p in positions]
750
+ y_coords = [p[1] for p in positions]
751
+ min_x, max_x = min(x_coords), max(x_coords)
752
+ min_y, max_y = min(y_coords), max(y_coords)
753
+
754
+ # Calculate scaling factors
755
+ scale = 150 # Increased scale factor
756
+ center_x = 1000 # Image center
757
+ center_y = 1000
758
+
759
+ # Add residue labels in a circular arrangement around the structure
760
+ n_residues = len(residues)
761
+ radius = 700 # Distance of labels from center
762
+
763
+ # Start from the rightmost point (3 o'clock position) and go counterclockwise
764
+ # Offset rotates the labels to line up with the drawn structure
765
+ offset = 0 # Adjust this value to match the structure alignment
766
+ for i, residue in enumerate(residues):
767
+ # Calculate position in a circle around the structure
768
+ # Start from 0 (3 o'clock) and go counterclockwise
769
+ angle = -(2 * np.pi * ((i + offset) % n_residues) / n_residues)
770
+
771
+ # Calculate label position
772
+ label_x = center_x + radius * np.cos(angle)
773
+ label_y = center_y + radius * np.sin(angle)
774
+
775
+ # Draw residue label
776
+ text = f"{i+1}. {residue}"
777
+ bbox = draw.textbbox((label_x, label_y), text, font=font)
778
+ padding = 10
779
+ draw.rectangle([bbox[0]-padding, bbox[1]-padding,
780
+ bbox[2]+padding, bbox[3]+padding],
781
+ fill='white', outline='white')
782
+ draw.text((label_x, label_y), text,
783
+ font=font, fill='black', anchor="mm")
784
+
785
+ # Add sequence at the top with white background
786
+ seq_text = f"Sequence: {sequence}"
787
+ bbox = draw.textbbox((center_x, 100), seq_text, font=small_font)
788
+ padding = 10
789
+ draw.rectangle([bbox[0]-padding, bbox[1]-padding,
790
+ bbox[2]+padding, bbox[3]+padding],
791
+ fill='white', outline='white')
792
+ draw.text((center_x, 100), seq_text,
793
+ font=small_font, fill='black', anchor="mm")
794
+
795
+ return img
796
+
797
+ """
798
+ def annotate_cyclic_structure(mol, sequence):
799
+ """Create structure visualization with just the sequence header"""
800
+ # Generate 2D coordinates
801
+ AllChem.Compute2DCoords(mol)
802
+
803
+ # Create drawer with larger size for annotations
804
+ drawer = Draw.rdMolDraw2D.MolDraw2DCairo(2000, 2000)
805
+
806
+ # Draw molecule first
807
+ drawer.drawOptions().addAtomIndices = False
808
+ drawer.DrawMolecule(mol)
809
+ drawer.FinishDrawing()
810
+
811
+ # Convert to PIL Image
812
+ img = Image.open(BytesIO(drawer.GetDrawingText()))
813
+ draw = ImageDraw.Draw(img)
814
+ try:
815
+ small_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 60)
816
+ except OSError:
817
+ try:
818
+ small_font = ImageFont.truetype("arial.ttf", 60)
819
+ except OSError:
820
+ print("Warning: TrueType fonts not available, using default font")
821
+ small_font = ImageFont.load_default()
822
+
823
+ # Add just the sequence header at the top
824
+ seq_text = f"Sequence: {sequence}"
825
+ bbox = draw.textbbox((1000, 100), seq_text, font=small_font)
826
+ padding = 10
827
+ draw.rectangle([bbox[0]-padding, bbox[1]-padding,
828
+ bbox[2]+padding, bbox[3]+padding],
829
+ fill='white', outline='white')
830
+ draw.text((1000, 100), seq_text,
831
+ font=small_font, fill='black', anchor="mm")
832
+
833
+ return img
834
+
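+ # Usage sketch (assumes an RDKit mol and a sequence from analyze_structure):
+ #   mol = Chem.MolFromSmiles(smiles)
+ #   img = annotate_cyclic_structure(mol, 'cyclo(Ala-Leu-Pro)')
+ #   img.save('annotated.png')  # hypothetical output path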
835
+ def create_enhanced_linear_viz(sequence, smiles):
836
+ """Create an enhanced linear representation using PeptideAnalyzer"""
837
+ analyzer = PeptideAnalyzer() # Create analyzer instance
838
+
839
+ # Create figure with two subplots
840
+ fig = plt.figure(figsize=(15, 10))
841
+ gs = fig.add_gridspec(2, 1, height_ratios=[1, 2])
842
+ ax_struct = fig.add_subplot(gs[0])
843
+ ax_detail = fig.add_subplot(gs[1])
844
+
845
+ # Parse sequence and get residues
846
+ if sequence.startswith('cyclo('):
847
+ residues = sequence[6:-1].split('-')
848
+ else:
849
+ residues = sequence.split('-')
850
+
851
+ # Get segments using analyzer
852
+ segments = analyzer.split_on_bonds(smiles)
853
+
854
+ # Debug print
855
+ print(f"Number of residues: {len(residues)}")
856
+ print(f"Number of segments: {len(segments)}")
857
+
858
+ # Top subplot - Basic structure
859
+ ax_struct.set_xlim(0, 10)
860
+ ax_struct.set_ylim(0, 2)
861
+
862
+ num_residues = len(residues)
863
+ spacing = 9.0 / (num_residues - 1) if num_residues > 1 else 9.0
864
+
865
+ # Draw basic structure
866
+ y_pos = 1.5
867
+ for i in range(num_residues):
868
+ x_pos = 0.5 + i * spacing
869
+
870
+ # Draw amino acid box
871
+ rect = patches.Rectangle((x_pos-0.3, y_pos-0.2), 0.6, 0.4,
872
+ facecolor='lightblue', edgecolor='black')
873
+ ax_struct.add_patch(rect)
874
+
875
+ # Draw connecting bonds if not the last residue
876
+ if i < num_residues - 1:
877
+ segment = segments[i] if i < len(segments) else None
878
+ if segment:
879
+ # Determine bond type from segment info
880
+ bond_type = 'ester' if 'OC(=O)' in segment.get('bond_after', '') else 'peptide'  # bond_after holds raw SMILES, cf. get_modifications
881
+ is_n_methylated = 'N(C)' in segment.get('bond_after', '')
882
+
883
+ bond_color = 'red' if bond_type == 'ester' else 'black'
884
+ linestyle = '--' if bond_type == 'ester' else '-'
885
+
886
+ # Draw bond line
887
+ ax_struct.plot([x_pos+0.3, x_pos+spacing-0.3], [y_pos, y_pos],
888
+ color=bond_color, linestyle=linestyle, linewidth=2)
889
+
890
+ # Add bond type label
891
+ mid_x = x_pos + spacing/2
892
+ bond_label = f"{bond_type}"
893
+ if is_n_methylated:
894
+ bond_label += "\n(N-Me)"
895
+ ax_struct.text(mid_x, y_pos+0.1, bond_label,
896
+ ha='center', va='bottom', fontsize=10,
897
+ color=bond_color)
898
+
899
+ # Add residue label
900
+ ax_struct.text(x_pos, y_pos-0.5, residues[i],
901
+ ha='center', va='top', fontsize=14)
902
+
903
+ # Bottom subplot - Detailed breakdown
904
+ ax_detail.set_ylim(0, len(segments)+1)
905
+ ax_detail.set_xlim(0, 1)
906
+
907
+ # Create detailed breakdown
908
+ segment_y = len(segments) # Start from top
909
+ for i, segment in enumerate(segments):
910
+ y = segment_y - i
911
+
912
+ # Check if this is a bond or residue
913
+ residue, mods = analyzer.identify_residue(segment)
914
+ if residue:
915
+ text = f"Residue {i+1}: {residue}"
916
+ if mods:
917
+ text += f" ({', '.join(mods)})"
918
+ color = 'blue'
919
+ else:
920
+ # Must be a bond
921
+ text = f"Bond {i}: "
922
+ if 'OC(=O)' in segment.get('bond_after', ''):
923
+ text += "ester"
924
+ elif 'N(C)' in segment.get('bond_after', ''):
925
+ text += "peptide (N-methylated)"
926
+ else:
927
+ text += "peptide"
928
+ color = 'red'
929
+
930
+ # Add segment analysis
931
+ ax_detail.text(0.05, y, text, fontsize=12, color=color)
932
+ ax_detail.text(0.5, y, f"SMILES: {segment.get('content', '')}", fontsize=10, color='gray')
933
+
934
+ # If cyclic, add connection indicator
935
+ if sequence.startswith('cyclo('):
936
+ ax_struct.annotate('', xy=(9.5, y_pos), xytext=(0.5, y_pos),
937
+ arrowprops=dict(arrowstyle='<->', color='red', lw=2))
938
+ ax_struct.text(5, y_pos+0.3, 'Cyclic Connection',
939
+ ha='center', color='red', fontsize=14)
940
+
941
+ # Add titles and adjust layout
942
+ ax_struct.set_title("Peptide Structure Overview", pad=20)
943
+ ax_detail.set_title("Segment Analysis Breakdown", pad=20)
944
+
945
+ # Remove axes
946
+ for ax in [ax_struct, ax_detail]:
947
+ ax.set_xticks([])
948
+ ax.set_yticks([])
949
+ ax.axis('off')
950
+
951
+ plt.tight_layout()
952
+ return fig
953
+
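+ # Usage sketch (fig is a matplotlib Figure; the path is hypothetical):
+ #   fig = create_enhanced_linear_viz('Ala-Leu-Pro', smiles)
+ #   fig.savefig('linear_view.png', dpi=300)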
954
+ class PeptideStructureGenerator:
955
+ """A class to generate 3D structures of peptides using different embedding methods"""
956
+
957
+ @staticmethod
958
+ def prepare_molecule(smiles):
959
+ """Prepare molecule with proper hydrogen handling"""
960
+ mol = Chem.MolFromSmiles(smiles, sanitize=False)
961
+ if mol is None:
962
+ raise ValueError("Failed to create molecule from SMILES")
963
+
964
+ # Calculate valence for each atom
965
+ for atom in mol.GetAtoms():
966
+ atom.UpdatePropertyCache(strict=False)
967
+
968
+ # Sanitize with reduced requirements
969
+ Chem.SanitizeMol(mol,
970
+ sanitizeOps=Chem.SANITIZE_FINDRADICALS|
971
+ Chem.SANITIZE_KEKULIZE|
972
+ Chem.SANITIZE_SETAROMATICITY|
973
+ Chem.SANITIZE_SETCONJUGATION|
974
+ Chem.SANITIZE_SETHYBRIDIZATION|
975
+ Chem.SANITIZE_CLEANUPCHIRALITY)
976
+
977
+ mol = Chem.AddHs(mol)
978
+ return mol
979
+
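+ # Note: sanitize=False plus the reduced SANITIZE_* flag set above leaves out
+ # the strict valence check, so peptide SMILES that full sanitization would
+ # reject can still be embedded; UpdatePropertyCache(strict=False) fills in
+ # the valence info those later steps need.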
980
+ @staticmethod
981
+ def get_etkdg_params(attempt=0):
982
+ """Get ETKDG parameters with optional modifications based on attempt number"""
983
+ params = AllChem.ETKDGv3()
984
+ params.randomSeed = -1
985
+ params.maxIterations = 200
986
+ params.numThreads = 4 # Reduced for web interface
987
+ params.useBasicKnowledge = True
988
+ params.enforceChirality = True
989
+ params.useExpTorsionAnglePrefs = True
990
+ params.useSmallRingTorsions = True
991
+ params.useMacrocycleTorsions = True
992
+ params.ETversion = 2
993
+ params.pruneRmsThresh = -1
994
+ params.embedRmsThresh = 0.5
995
+
996
+ if attempt > 10:
997
+ params.bondLength = 1.5 + (attempt - 10) * 0.02
998
+ params.useExpTorsionAnglePrefs = False
999
+
1000
+ return params
1001
+
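+ # Usage sketch: AllChem.EmbedMolecule returns the conformer id (0) on success
+ # and -1 on failure, which is what the callers below test against:
+ #   params = PeptideStructureGenerator.get_etkdg_params(attempt=0)
+ #   if AllChem.EmbedMolecule(mol, params) == 0: ...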
1002
+ def generate_structure_etkdg(self, smiles, max_attempts=20):
1003
+ """Generate 3D structure using ETKDG without UFF optimization"""
1004
+ success = False
1005
+ mol = None
1006
+
1007
+ for attempt in range(max_attempts):
1008
+ try:
1009
+ mol = self.prepare_molecule(smiles)
1010
+ params = self.get_etkdg_params(attempt)
1011
+
1012
+ if AllChem.EmbedMolecule(mol, params) == 0:
1013
+ success = True
1014
+ break
1015
+ except Exception:
1016
+ continue
1017
+
1018
+ if not success:
1019
+ raise ValueError("Failed to generate structure with ETKDG")
1020
+
1021
+ return mol
1022
+
1023
+ def generate_structure_uff(self, smiles, max_attempts=20):
1024
+ """Generate 3D structure using ETKDG followed by UFF optimization"""
1025
+ best_mol = None
1026
+ lowest_energy = float('inf')
1027
+
1028
+ for attempt in range(max_attempts):
1029
+ try:
1030
+ test_mol = self.prepare_molecule(smiles)
1031
+ params = self.get_etkdg_params(attempt)
1032
+
1033
+ if AllChem.EmbedMolecule(test_mol, params) == 0:
1034
+ res = AllChem.UFFOptimizeMolecule(test_mol, maxIters=2000,
1035
+ vdwThresh=10.0, confId=0,
1036
+ ignoreInterfragInteractions=True)
1037
+
1038
+ if res == 0:
1039
+ ff = AllChem.UFFGetMoleculeForceField(test_mol)
1040
+ if ff:
1041
+ current_energy = ff.CalcEnergy()
1042
+ if current_energy < lowest_energy:
1043
+ lowest_energy = current_energy
1044
+ best_mol = Chem.Mol(test_mol)
1045
+ except Exception:
1046
+ continue
1047
+
1048
+ if best_mol is None:
1049
+ raise ValueError("Failed to generate optimized structure")
1050
+
1051
+ return best_mol
1052
+
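+ # The UFF path keeps the lowest-energy conformer found across attempts, e.g.:
+ #   gen = PeptideStructureGenerator()
+ #   mol3d = gen.generate_structure_uff(smiles, max_attempts=20)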
1053
+ @staticmethod
1054
+ def mol_to_sdf_bytes(mol):
1055
+ """Convert RDKit molecule to SDF file bytes"""
1056
+ # First write to StringIO in text mode
1057
+ sio = StringIO()
1058
+ writer = Chem.SDWriter(sio)
1059
+ writer.write(mol)
1060
+ writer.close()
1061
+
1062
+ # Convert the string to bytes
1063
+ return sio.getvalue().encode('utf-8')
1064
+
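+ # Example (hypothetical output path):
+ #   sdf_bytes = PeptideStructureGenerator.mol_to_sdf_bytes(mol3d)
+ #   with open('peptide.sdf', 'wb') as fh:
+ #       fh.write(sdf_bytes)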
1065
+ def process_input(smiles_input=None, file_obj=None, show_linear=False,
1066
+ show_segment_details=False, generate_3d=False, use_uff=False):
1067
+ """Process input and create visualizations using PeptideAnalyzer"""
1068
+ analyzer = PeptideAnalyzer()
1069
+ temp_dir = tempfile.mkdtemp() if generate_3d else None
1070
+ structure_files = []
1071
+
1072
+ # Handle direct SMILES input
1073
+ if smiles_input:
1074
+ smiles = smiles_input.strip()
1075
+
1076
+ # First check if it's a peptide using analyzer's method
1077
+ if not analyzer.is_peptide(smiles):
1078
+ return "Error: Input SMILES does not appear to be a peptide structure.", None, None
1079
+
1080
+ try:
1081
+ # Create molecule
1082
+ mol = Chem.MolFromSmiles(smiles)
1083
+ if mol is None:
1084
+ return "Error: Invalid SMILES notation.", None, None
1085
+
1086
+ # Generate 3D structures if requested
1087
+ if generate_3d:
1088
+ generator = PeptideStructureGenerator()
1089
+
1090
+ try:
1091
+ # Generate ETKDG structure
1092
+ mol_etkdg = generator.generate_structure_etkdg(smiles)
1093
+ etkdg_path = os.path.join(temp_dir, "structure_etkdg.sdf")
1094
+ writer = Chem.SDWriter(etkdg_path)
1095
+ writer.write(mol_etkdg)
1096
+ writer.close()
1097
+ structure_files.append(etkdg_path)
1098
+
1099
+ # Generate UFF structure if requested
1100
+ if use_uff:
1101
+ mol_uff = generator.generate_structure_uff(smiles)
1102
+ uff_path = os.path.join(temp_dir, "structure_uff.sdf")
1103
+ writer = Chem.SDWriter(uff_path)
1104
+ writer.write(mol_uff)
1105
+ writer.close()
1106
+ structure_files.append(uff_path)
1107
+
1108
+ except Exception as e:
1109
+ return f"Error generating 3D structures: {str(e)}", None, None, None
1110
+
1111
+ # Use analyzer to get sequence
1112
+ segments = analyzer.split_on_bonds(smiles)
1113
+
1114
+ # Process segments and build sequence
1115
+ sequence_parts = []
1116
+ output_text = ""
1117
+
1118
+ # Only include segment analysis in output if requested
1119
+ if show_segment_details:
1120
+ output_text += "Segment Analysis:\n"
1121
+ for i, segment in enumerate(segments):
1122
+ output_text += f"\nSegment {i}:\n"
1123
+ output_text += f"Content: {segment['content']}\n"
1124
+ output_text += f"Bond before: {segment.get('bond_before', 'None')}\n"
1125
+ output_text += f"Bond after: {segment.get('bond_after', 'None')}\n"
1126
+
1127
+ residue, mods = analyzer.identify_residue(segment)
1128
+ if residue:
1129
+ if mods:
1130
+ sequence_parts.append(f"{residue}({','.join(mods)})")
1131
+ else:
1132
+ sequence_parts.append(residue)
1133
+ output_text += f"Identified as: {residue}\n"
1134
+ output_text += f"Modifications: {mods}\n"
1135
+ else:
1136
+ output_text += f"Warning: Could not identify residue in segment: {segment['content']}\n"
1137
+ output_text += "\n"
1138
+ else:
1139
+ # Just build sequence without detailed analysis in output
1140
+ for segment in segments:
1141
+ residue, mods = analyzer.identify_residue(segment)
1142
+ if residue:
1143
+ if mods:
1144
+ sequence_parts.append(f"{residue}({','.join(mods)})")
1145
+ else:
1146
+ sequence_parts.append(residue)
1147
+
1148
+ # Check if cyclic using analyzer's method
1149
+ is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
1150
+ three_letter = '-'.join(sequence_parts)
1151
+ one_letter = ''.join(analyzer.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence_parts)
1152
+
1153
+ if is_cyclic:
1154
+ three_letter = f"cyclo({three_letter})"
1155
+ one_letter = f"cyclo({one_letter})"
1156
+
1157
+ # Create cyclic structure visualization
1158
+ img_cyclic = annotate_cyclic_structure(mol, three_letter)
1159
+
1160
+ # Create linear representation if requested
1161
+ img_linear = None
1162
+ if show_linear:
1163
+ fig_linear = create_enhanced_linear_viz(three_letter, smiles)
1164
+ buf = BytesIO()
1165
+ fig_linear.savefig(buf, format='png', bbox_inches='tight', dpi=300)
1166
+ buf.seek(0)
1167
+ img_linear = Image.open(buf)
1168
+ plt.close(fig_linear)
1169
+
1170
+ # Add summary to output
1171
+ summary = "Summary:\n"
1172
+ summary += f"Sequence: {three_letter}\n"
1173
+ summary += f"One-letter code: {one_letter}\n"
1174
+ summary += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n"
1175
+ #if is_cyclic:
1176
+ #summary += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
1177
+ #summary += f"Aromatic Cycles: {', '.join(aromatic_cycles)}\n"
1178
+
1179
+ if structure_files:
1180
+ summary += "\n3D Structures Generated:\n"
1181
+ for filepath in structure_files:
1182
+ summary += f"- {os.path.basename(filepath)}\n"
1183
+
1184
+ return summary + output_text, img_cyclic, img_linear, structure_files if structure_files else None
1185
+
1186
+ except Exception as e:
1187
+ return f"Error processing SMILES: {str(e)}", None, None, None
1188
+
1189
+ # Handle file input
1190
+ if file_obj is not None:
1191
+ try:
1192
+ # Handle file content
1193
+ if hasattr(file_obj, 'name'):
1194
+ with open(file_obj.name, 'r') as f:
1195
+ content = f.read()
1196
+ else:
1197
+ content = file_obj.decode('utf-8') if isinstance(file_obj, bytes) else str(file_obj)
1198
+
1199
+ output_text = ""
1200
+ for line in content.splitlines():
1201
+ smiles = line.strip()
1202
+ if smiles:
1203
+ # Check if it's a peptide
1204
+ if not analyzer.is_peptide(smiles):
1205
+ output_text += f"Skipping non-peptide SMILES: {smiles}\n"
1206
+ continue
1207
+
1208
+ # Process this SMILES
1209
+ segments = analyzer.split_on_bonds(smiles)
1210
+ sequence_parts = []
1211
+
1212
+ # Add segment details if requested
1213
+ if show_segment_details:
1214
+ output_text += f"\nSegment Analysis for SMILES: {smiles}\n"
1215
+ for i, segment in enumerate(segments):
1216
+ output_text += f"\nSegment {i}:\n"
1217
+ output_text += f"Content: {segment['content']}\n"
1218
+ output_text += f"Bond before: {segment.get('bond_before', 'None')}\n"
1219
+ output_text += f"Bond after: {segment.get('bond_after', 'None')}\n"
1220
+ residue, mods = analyzer.identify_residue(segment)
1221
+ if residue:
1222
+ if mods:
1223
+ sequence_parts.append(f"{residue}({','.join(mods)})")
1224
+ else:
1225
+ sequence_parts.append(residue)
1226
+ output_text += f"Identified as: {residue}\n"
1227
+ output_text += f"Modifications: {mods}\n"
1228
+ else:
1229
+ for segment in segments:
1230
+ residue, mods = analyzer.identify_residue(segment)
1231
+ if residue:
1232
+ if mods:
1233
+ sequence_parts.append(f"{residue}({','.join(mods)})")
1234
+ else:
1235
+ sequence_parts.append(residue)
1236
+
1237
+ # Get cyclicity and create sequence
1238
+ is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
1239
+ sequence = f"cyclo({'-'.join(sequence_parts)})" if is_cyclic else '-'.join(sequence_parts)
1240
+
1241
+ output_text += f"\nSummary for SMILES: {smiles}\n"
1242
+ output_text += f"Sequence: {sequence}\n"
1243
+ output_text += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n"
1244
+ if is_cyclic:
1245
+ output_text += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
1246
+ #output_text += f"Aromatic Cycles: {', '.join(aromatic_cycles)}\n"
1247
+ output_text += "-" * 50 + "\n"
1248
+
1249
+ return output_text, None, None, None
1250
+
1251
+ except Exception as e:
1252
+ return f"Error processing file: {str(e)}", None, None
1253
+
1254
+ return "No input provided.", None, None
1255
+
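+ # Usage sketch (illustrative SMILES; every path returns a 4-tuple of
+ # (text, cyclic_image, linear_image, structure_files)):
+ #   summary, img_cyclic, img_linear, files = process_input(
+ #       smiles_input='CC(=O)N[C@@H](C)C(=O)O')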