<?xml version="1.0" encoding="UTF-8"?>
<!--
  Crossref reference deposit (doi_resources schema 4.3.0).
  Supplies the cited-reference list for DOI 10.35940/ijrte.C6439.0910321.

  Layout rule: leaf elements that carry text (journal_title, article_title,
  unstructured_citation, ...) are kept on a single line, because any line
  break inside them becomes part of the deposited text.

  NOTE(review): conference proceedings titles are carried in journal_title,
  as in the original deposit. Confirm against the schema whether another
  element (for example volume_title) is preferred for proceedings.
-->
<doi_batch version="4.3.0"
           xmlns="http://www.crossref.org/doi_resources_schema/4.3.0"
           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="http://www.crossref.org/doi_resources_schema/4.3.0 http://www.crossref.org/schema/deposit/doi_resources4.3.0.xsd">
  <head>
    <doi_batch_id>f45588c7-713e-4cb9-8781-33edf8e7758c</doi_batch_id>
    <depositor>
      <name>beie</name>
      <email_address>director@blueeyesintelligence.org</email_address>
    </depositor>
  </head>
  <body>
    <doi_citations>
      <!-- DOI of the citing article; every citation below belongs to its reference list. -->
      <doi>10.35940/ijrte.C6439.0910321</doi>
      <citation_list>
        <citation key="ref0">
          <unstructured_citation>Mao, J., Xu, W., Yang, Y., Wang, J., Huang, Z., &amp; Yuille, A. (2014). Deep captioning with multimodal recurrent neural networks (m-rnn). arXiv preprint arXiv:1412.6632.</unstructured_citation>
        </citation>
        <citation key="ref1">
          <journal_title>Proceedings of the IEEE conference on computer vision and pattern recognition</journal_title>
          <author>Karpathy</author>
          <cYear>2015</cYear>
          <doi>10.1109/cvpr.2015.7298932</doi>
          <article_title>Deep visual-semantic alignments for generating image descriptions</article_title>
          <unstructured_citation>Karpathy, A., &amp; Fei-Fei, L. (2015). Deep visual-semantic alignments for generating image descriptions. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 3128-3137).</unstructured_citation>
        </citation>
        <citation key="ref2">
          <unstructured_citation>Karpathy, A., Joulin, A., &amp; Fei-Fei, L. (2014). Deep fragment embeddings for bidirectional image sentence mapping. arXiv preprint arXiv:1406.5679.</unstructured_citation>
        </citation>
        <citation key="ref3">
          <journal_title>Proceedings of the IEEE conference on computer vision and pattern recognition</journal_title>
          <author>Vinyals</author>
          <cYear>2015</cYear>
          <doi>10.1109/cvpr.2015.7298935</doi>
          <article_title>Show and tell: A neural image caption generator</article_title>
          <unstructured_citation>Vinyals, O., Toshev, A., Bengio, S., &amp; Erhan, D. (2015). Show and tell: A neural image caption generator. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 3156-3164).</unstructured_citation>
        </citation>
        <citation key="ref4">
          <journal_title>Proceedings of the IEEE conference on computer vision and pattern recognition</journal_title>
          <author>Anderson</author>
          <cYear>2018</cYear>
          <doi>10.1109/cvpr.2018.00636</doi>
          <article_title>Bottom-up and top-down attention for image captioning and visual question answering</article_title>
          <unstructured_citation>Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., &amp; Zhang, L. (2018). Bottom-up and top-down attention for image captioning and visual question answering. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 6077-6086).</unstructured_citation>
        </citation>
        <citation key="ref5">
          <journal_title>Proceedings of the IEEE/CVF International Conference on Computer Vision</journal_title>
          <author>Huang</author>
          <cYear>2019</cYear>
          <doi>10.1109/iccv.2019.00473</doi>
          <article_title>Attention on attention for image captioning</article_title>
          <unstructured_citation>Huang, L., Wang, W., Chen, J., &amp; Wei, X. Y. (2019). Attention on attention for image captioning. In Proceedings of the IEEE/CVF International Conference on Computer Vision (pp. 4634-4643).</unstructured_citation>
        </citation>
        <citation key="ref6">
          <unstructured_citation>Perez, L., &amp; Wang, J. (2017). The effectiveness of data augmentation in image classification using deep learning. arXiv preprint arXiv:1712.04621.</unstructured_citation>
        </citation>
        <citation key="ref7">
          <journal_title>Journal of Big Data</journal_title>
          <author>Shorten</author>
          <volume>6</volume>
          <issue>1</issue>
          <first_page>1</first_page>
          <cYear>2019</cYear>
          <doi>10.1186/s40537-019-0197-0</doi>
          <article_title>A survey on image data augmentation for deep learning</article_title>
          <unstructured_citation>Shorten, C., &amp; Khoshgoftaar, T. M. (2019). A survey on image data augmentation for deep learning. Journal of Big Data, 6(1), 1-48.</unstructured_citation>
        </citation>
        <citation key="ref8">
          <unstructured_citation>Inoue, H. (2018). Data augmentation by pairing samples for images classification. arXiv preprint arXiv:1801.02929.</unstructured_citation>
        </citation>
        <citation key="ref9">
          <unstructured_citation>Bujimalla, S., Subedar, M., &amp; Tickoo, O. (2021). Data augmentation to improve robustness of image captioning solutions. arXiv preprint arXiv:2106.05437.</unstructured_citation>
        </citation>
        <citation key="ref10">
          <doi>10.14569/IJACSA.2019.0101074</doi>
          <unstructured_citation>Aldabbas, H., Asad, M., Ryalat, M. H., Malik, K. R., &amp; Qureshi, M. Z. A. (2019). Data Augmentation to Stabilize Image Caption Generation Models in Deep Learning.</unstructured_citation>
        </citation>
        <citation key="ref11">
          <unstructured_citation>Mitchell, M., Dodge, J., Goyal, A., Yamaguchi, K., Stratos, K., Han, X., … &amp; Daumé III, H. (2012, April). Midge: Generating image descriptions from computer vision detections. In Proceedings of the 13th Conference of the European Chapter of the Association for Computational Linguistics (pp. 747-756).</unstructured_citation>
        </citation>
        <citation key="ref12">
          <unstructured_citation>Yang, Y., Teo, C., Daumé III, H., &amp; Aloimonos, Y. (2011, July). Corpus-guided sentence generation of natural images. In Proceedings of the 2011 Conference on Empirical Methods in Natural Language Processing (pp. 444-454).</unstructured_citation>
        </citation>
        <citation key="ref13">
          <unstructured_citation>Kuznetsova, P., Ordonez, V., Berg, A., Berg, T., &amp; Choi, Y. (2012, July). Collective generation of natural image descriptions. In Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) (pp. 359-368).</unstructured_citation>
        </citation>
        <citation key="ref14">
          <doi>10.1007/978-3-642-15561-1_2</doi>
          <unstructured_citation>Farhadi, A., Hejrati, M., Sadeghi, M. A., Young, P., Rashtchian, C., Hockenmaier, J., &amp; Forsyth, D. (2010, September). Every picture tells a story: Generating sentences from images. In European conference on computer vision (pp. 15-29). Springer, Berlin, Heidelberg.</unstructured_citation>
        </citation>
        <citation key="ref15">
          <doi>10.3115/v1/P15-2017</doi>
          <unstructured_citation>Devlin, J., Cheng, H., Fang, H., Gupta, S., Deng, L., He, X., … &amp; Mitchell, M. (2015). Language models for image captioning: The quirks and what works. arXiv preprint arXiv:1505.01809.</unstructured_citation>
        </citation>
        <citation key="ref16">
          <journal_title>Proceedings of the IEEE conference on computer vision and pattern recognition</journal_title>
          <author>You</author>
          <cYear>2016</cYear>
          <doi>10.1109/cvpr.2016.503</doi>
          <article_title>Image captioning with semantic attention</article_title>
          <unstructured_citation>You, Q., Jin, H., Wang, Z., Fang, C., &amp; Luo, J. (2016). Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4651-4659).</unstructured_citation>
        </citation>
        <citation key="ref17">
          <journal_title>Proceedings of the IEEE conference on computer vision and pattern recognition</journal_title>
          <author>Yao</author>
          <cYear>2017</cYear>
          <doi>10.1109/cvpr.2017.559</doi>
          <article_title>Incorporating copying mechanism in image captioning for learning novel objects</article_title>
          <unstructured_citation>Yao, T., Pan, Y., Li, Y., &amp; Mei, T. (2017). Incorporating copying mechanism in image captioning for learning novel objects. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 6580-6588).</unstructured_citation>
        </citation>
        <citation key="ref18">
          <journal_title>IEEE transactions on pattern analysis and machine intelligence</journal_title>
          <author>Kulkarni</author>
          <volume>35</volume>
          <issue>12</issue>
          <first_page>2891</first_page>
          <cYear>2013</cYear>
          <doi>10.1109/TPAMI.2012.162</doi>
          <article_title>Babytalk: Understanding and generating simple image descriptions</article_title>
          <unstructured_citation>Kulkarni, G., Premraj, V., Ordonez, V., Dhar, S., Li, S., Choi, Y., … &amp; Berg, T. L. (2013). Babytalk: Understanding and generating simple image descriptions. IEEE transactions on pattern analysis and machine intelligence, 35(12), 2891-2903.</unstructured_citation>
        </citation>
        <citation key="ref19">
          <journal_title>Proceedings of the IEEE conference on computer vision and pattern recognition</journal_title>
          <author>Fang</author>
          <cYear>2015</cYear>
          <doi>10.1109/cvpr.2015.7298754</doi>
          <article_title>From captions to visual concepts and back</article_title>
          <unstructured_citation>Fang, H., Gupta, S., Iandola, F., Srivastava, R. K., Deng, L., Dollár, P., … &amp; Zweig, G. (2015). From captions to visual concepts and back. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1473-1482).</unstructured_citation>
        </citation>
        <!-- NOTE(review): ref20 repeats ref3 (same DOI and text); kept so the keys stay aligned with the original deposit. -->
        <citation key="ref20">
          <journal_title>Proceedings of the IEEE conference on computer vision and pattern recognition</journal_title>
          <author>Vinyals</author>
          <cYear>2015</cYear>
          <doi>10.1109/cvpr.2015.7298935</doi>
          <article_title>Show and tell: A neural image caption generator</article_title>
          <unstructured_citation>Vinyals, O., Toshev, A., Bengio, S., &amp; Erhan, D. (2015). Show and tell: A neural image caption generator. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 3156-3164).</unstructured_citation>
        </citation>
        <citation key="ref21">
          <journal_title>Wireless Communications and Mobile Computing</journal_title>
          <author>Chu</author>
          <!-- "2020" was previously fused into journal_title; it sits in the volume position of the unstructured citation. -->
          <volume>2020</volume>
          <cYear>2020</cYear>
          <doi>10.1155/2020/8909458</doi>
          <article_title>Automatic image captioning based on ResNet50 and LSTM with soft attention</article_title>
          <unstructured_citation>Chu, Y., Yue, X., Yu, L., Sergei, M., &amp; Wang, Z. (2020). Automatic image captioning based on ResNet50 and LSTM with soft attention. Wireless Communications and Mobile Computing, 2020.</unstructured_citation>
        </citation>
        <!-- NOTE(review): ref22 is a figure-credit note, not a bibliographic reference; confirm whether it should be deposited at all. -->
        <citation key="ref22">
          <unstructured_citation>NOTE: All the pictures used in this study( Fig [1-13]) are taken from the Flickr8k dataset.</unstructured_citation>
        </citation>
      </citation_list>
    </doi_citations>
  </body>
</doi_batch>
